用DOM实现文章采集--采集到网页源码
程序员文章站
2022-07-02 18:35:59
先来个采集网页的代码。
[csharp]
using system;
using system.collections.generic;
using...
先来个采集网页的代码。
[csharp]
using system;
using system.collections.generic;
using system.io;
using system.io.compression;
using system.net;
using system.text;
namespace topwincms.common
{
public class nethelper
{
//private string _http_user_agent = "mozilla/4.0+(compatible;+msie+6.0;+windows+nt+5.2;+sv1;+.net+clr+1.1.4322;+.net+clr+2.0.50727)";
// Backing fields for the request settings exposed through the properties below.
private string _useragent = "googlebot/2.1 (+https://www.google.com/bot.html)";
private encoding _httpencoding = null;
private string _proxyhost = string.empty;
private int _proxyint = 8080;
private int _timeout = 200000;

#region Properties
/// <summary>
/// User-Agent value that get and post assign to their requests.
/// </summary>
public string useragent
{
    get { return _useragent; }
    set { _useragent = value; }
}

/// <summary>
/// Text encoding for HTTP content. When left null, get derives the
/// encoding from the response's character set; post also uses this
/// value to encode the data it sends.
/// </summary>
public encoding httpencoding
{
    get { return _httpencoding; }
    set { _httpencoding = value; }
}

/// <summary>
/// Proxy server host. get and post only configure a proxy when this
/// string is non-empty.
/// </summary>
public string proxyhost
{
    get { return _proxyhost; }
    set { _proxyhost = value; }
}

/// <summary>
/// Proxy server port, passed to webproxy together with proxyhost.
/// </summary>
public int proxyint
{
    get { return _proxyint; }
    set { _proxyint = value; }
}

/// <summary>
/// Default timeout that get and post assign to request.timeout.
/// </summary>
public int timeout
{
    get { return _timeout; }
    set { _timeout = value; }
}
#endregion
/// <summary>
/// Convenience overload: wraps the address string in a uri object and
/// hands it to get(uri).
/// </summary>
/// <param name="uri">Address of the page to download.</param>
/// <returns>Whatever get(uri) returns for the parsed address.</returns>
public remoteres get(string uri)
{
    uri target = new uri(uri);
    return get(target);
}
/// <summary>
/// Downloads the resource at the given address with an HTTP GET request
/// and decodes the body as text.
/// </summary>
/// <param name="uri">Address to request; its host is also used to build the Referer header.</param>
/// <returns>
/// A remoteres. On success html, contenttype and statuscode are filled
/// from the response. On failure statuscode holds the server's status
/// (serviceunavailable when there is no HTTP response,
/// internalservererror for non-web errors) and code holds the message.
/// </returns>
public remoteres get(uri uri)
{
    remoteres info = new remoteres();
    httpwebrequest request = (httpwebrequest)webrequest.create(uri);
    request.timeout = this._timeout;
    request.useragent = this._useragent;
    request.method = "get";
    request.referer = string.concat("https://", uri.host);
    if (this._proxyhost.length > 0)
    {
        request.proxy = new webproxy(this._proxyhost, this._proxyint);
    }
    httpwebresponse response = null;
    stream responsestream = null;
    try
    {
        response = (httpwebresponse)request.getresponse();
        responsestream = response.getresponsestream();

        // Only Content-Encoding describes how this body was encoded.
        // The previous code looked at an Accept-Encoding response header
        // first, which says nothing about the body and, when present,
        // prevented the Content-Encoding check from running at all.
        // NOTE(review): mycollections.contain is defined outside this file;
        // presumably it tests whether the header contains any listed token.
        string contentencoding = response.headers["content-encoding"];
        if (contentencoding != null
            && mycollections.contain(contentencoding, "*", "gzip", "x-gzip"))
        {
            responsestream = new gzipstream(responsestream, compressionmode.decompress);
        }

        encoding textencoding = this._httpencoding;
        if (textencoding == null)
        {
            // characterset may be null or empty when the server sends no
            // usable Content-Type; getencoding would throw on it, so the
            // UTF-8 fallback has to be decided before the lookup.
            string charset = response.characterset;
            string lowered = charset == null ? string.empty : charset.tolower();
            if (lowered.length == 0)
            {
                textencoding = encoding.utf8;
            }
            else if (lowered.length > 3 && lowered.substring(0, 3) == "iso")
            {
                textencoding = encoding.default;
            }
            else
            {
                try
                {
                    textencoding = encoding.getencoding(charset);
                }
                catch (argumentexception)
                {
                    // Unknown charset name announced by the server.
                    textencoding = encoding.utf8;
                }
            }
        }

        using (streamreader reader = new streamreader(responsestream, textencoding))
        {
            info.html = reader.readtoend();
        }
        info.contenttype = response.contenttype;
        info.statuscode = response.statuscode;
    }
    catch (webexception we)
    {
        httpwebresponse errorresponse = we.response as httpwebresponse;
        if (errorresponse != null)
        {
            info.statuscode = errorresponse.statuscode;
        }
        else
        {
            info.statuscode = httpstatuscode.serviceunavailable;
        }
        info.code = "错误:" + we.message;
        // The response attached to the exception holds the connection
        // until it is closed.
        if (we.response != null)
        {
            we.response.close();
        }
    }
    catch (exception ex)
    {
        info.code = "错误:" + ex.message;
        info.statuscode = httpstatuscode.internalservererror;
    }
    finally
    {
        if (responsestream != null)
            responsestream.close();
        if (response != null)
            response.close();
    }
    return info;
}
#region Fetch remote resource
/// <summary>
/// Downloads the raw bytes of a remote resource.
/// </summary>
/// <param name="strurl">Address of the resource to fetch.</param>
/// <returns>
/// A remoteres with bytes, contenttype and statuscode filled in, or
/// null when the request fails for any reason.
/// </returns>
public remoteres getremoteresource(string strurl)
{
    httpwebresponse response = null;
    stream stream = null;
    try
    {
        httpwebrequest request = (httpwebrequest)webrequest.create(strurl);
        request.allowautoredirect = true;
        request.useragent = "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; slcc1; .net clr 2.0.50727; .net clr 3.0.04506)";
        request.referer = "https://" + new uri(strurl).host;
        response = request.getresponse() as httpwebresponse;
        stream = response.getresponsestream();

        remoteres info = new remoteres();
        info.contenttype = response.contenttype;
        using (memorystream ms = new memorystream())
        {
            byte[] buffer = new byte[4096];
            int c;
            while ((c = stream.read(buffer, 0, buffer.length)) > 0)
            {
                ms.write(buffer, 0, c);
            }
            info.bytes = ms.toarray();
        }
        info.statuscode = response.statuscode;
        return info;
    }
    catch (webexception we)
    {
        // Failure is reported as null, so no status is recorded here.
        // The error response still has to be closed to release its
        // connection.
        if (we.response != null)
        {
            we.response.close();
        }
        return null;
    }
    catch
    {
        return null;
    }
    finally
    {
        if (stream != null)
            stream.close();
        if (response != null)
            response.close();
    }
}
#endregion
/// <summary>
/// Sends form data to an address with an HTTP POST request and decodes
/// the response body as text.
/// </summary>
/// <param name="strurl">Address to post to.</param>
/// <param name="postdata">Already url-encoded form data to send as the request body.</param>
/// <returns>
/// A remoteres. On success html holds the response text and statuscode
/// the response status. On failure html and code both hold the error
/// message and statuscode holds the server's status
/// (serviceunavailable when there is no HTTP response,
/// internalservererror for non-web errors).
/// </returns>
public remoteres post(string strurl, string postdata)
{
    remoteres info = new remoteres();
    stream responsestream = null;
    httpwebresponse response = null;
    try
    {
        // _httpencoding is null unless the caller set it; calling
        // getbytes on it directly made every such post fail with a null
        // reference. UTF-8 is used for the request body in that case.
        encoding requestencoding = this._httpencoding != null ? this._httpencoding : encoding.utf8;
        byte[] bytes = requestencoding.getbytes(postdata);

        httpwebrequest request = (httpwebrequest)webrequest.create(strurl);
        request.method = "post";
        request.contenttype = "application/x-www-form-urlencoded";
        request.contentlength = bytes.length;
        request.timeout = this._timeout;
        request.useragent = this._useragent;
        //request.referer = string.concat("https://", uri.host);
        if (this._proxyhost.length > 0)
        {
            request.proxy = new webproxy(this._proxyhost, this._proxyint);
        }
        using (stream requeststream = request.getrequeststream())
        {
            requeststream.write(bytes, 0, bytes.length);
        }

        response = (httpwebresponse)request.getresponse();
        responsestream = response.getresponsestream();

        encoding textencoding = this._httpencoding;
        if (textencoding == null)
        {
            // Decide the empty-charset fallback before calling getencoding,
            // which throws on an empty or unknown name.
            string charset = response.characterset;
            string lowered = charset == null ? string.empty : charset.tolower();
            if (lowered.length == 0)
            {
                textencoding = encoding.default;
            }
            else if (lowered.length > 3 && lowered.substring(0, 3) == "iso")
            {
                textencoding = encoding.default;
            }
            else
            {
                try
                {
                    textencoding = encoding.getencoding(charset);
                }
                catch (argumentexception)
                {
                    textencoding = encoding.default;
                }
            }
        }

        using (streamreader reader = new streamreader(responsestream, textencoding))
        {
            info.html = reader.readtoend();
        }
        info.statuscode = response.statuscode;
    }
    catch (webexception we)
    {
        httpwebresponse errorresponse = we.response as httpwebresponse;
        if (errorresponse != null)
        {
            info.statuscode = errorresponse.statuscode;
        }
        else
        {
            info.statuscode = httpstatuscode.serviceunavailable;
        }
        info.code = "错误:" + we.message;
        // Kept in html as well because this method has always reported
        // its errors there.
        info.html = "错误:" + we.message;
        if (we.response != null)
        {
            we.response.close();
        }
    }
    catch (exception ex)
    {
        info.statuscode = httpstatuscode.internalservererror;
        info.code = "错误:" + ex.message;
        info.html = "错误:" + ex.message;
    }
    finally
    {
        if (responsestream != null)
            responsestream.close();
        if (response != null)
            response.close();
    }
    return info;
}
#region Link checking
/// <summary>
/// Checks whether a link exists by sending a HEAD request to it.
/// </summary>
/// <param name="strurl">Address to probe.</param>
/// <returns>
/// true only when the server answers with status ok. Redirects are not
/// followed, and any web-level failure (error status, unreachable host,
/// timeout) yields false.
/// </returns>
public bool urlexist(string strurl)
{
    httpwebrequest request = (httpwebrequest)webrequest.create(strurl);
    request.method = "head";
    request.allowautoredirect = false;
    request.useragent = "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; slcc1; .net clr 2.0.50727; .net clr 3.0.04506; .net clr 3.5.21022; .net clr 1.0.3705; .net clr 1.1.4322)";
    httpwebresponse response = null;
    try
    {
        response = (httpwebresponse)request.getresponse();
        return response.statuscode == httpstatuscode.ok;
    }
    catch (webexception we)
    {
        // getresponse throws for error statuses and connection failures;
        // for this check those simply mean the link is not available.
        if (we.response != null)
        {
            we.response.close();
        }
        return false;
    }
    finally
    {
        // Close on every path, including success, so the connection is
        // released before the next link is probed.
        if (response != null)
            response.close();
    }
}

/// <summary>
/// Checks whether the number of dead links in a list stays within a
/// tolerated limit.
/// </summary>
/// <param name="urls">Addresses to probe.</param>
/// <param name="allowbadnum">Maximum number of dead links that is still acceptable.</param>
/// <returns>false as soon as more than allowbadnum links fail; otherwise true.</returns>
public bool urlexist(list<string> urls, int allowbadnum)
{
    // With no more links than the tolerated number of failures, the limit
    // cannot be exceeded, so nothing needs to be probed.
    if (urls.count <= allowbadnum)
    {
        return true;
    }
    int badcount = 0;
    foreach (string strurl in urls)
    {
        if (!urlexist(strurl))
        {
            badcount++;
            if (badcount > allowbadnum)
            {
                return false;
            }
        }
    }
    return true;
}
#endregion
}
/// <summary>
/// Result object filled in by the nethelper request methods.
/// </summary>
public class remoteres
{
    private httpstatuscode _statuscode;
    private string _contenttype;
    private byte[] _bytes;
    private string _html;
    private string _code;

    /// <summary>
    /// Message code of the result; nethelper.get stores its error text here.
    /// </summary>
    public string code
    {
        get { return _code; }
        set { _code = value; }
    }

    /// <summary>
    /// Text content of the result.
    /// </summary>
    public string html
    {
        get { return _html; }
        set { _html = value; }
    }

    /// <summary>
    /// Raw bytes of the remote resource.
    /// </summary>
    public byte[] bytes
    {
        get { return _bytes; }
        set { _bytes = value; }
    }

    /// <summary>
    /// Content type reported for the resource.
    /// </summary>
    public string contenttype
    {
        get { return _contenttype; }
        set { _contenttype = value; }
    }

    /// <summary>
    /// HTTP status code of the request.
    /// </summary>
    public httpstatuscode statuscode
    {
        get { return _statuscode; }
        set { _statuscode = value; }
    }
}
}
摘自 winner2050的专栏
[csharp]
using system;
using system.collections.generic;
using system.io;
using system.io.compression;
using system.net;
using system.text;
namespace topwincms.common
{
public class nethelper
{
//private string _http_user_agent = "mozilla/4.0+(compatible;+msie+6.0;+windows+nt+5.2;+sv1;+.net+clr+1.1.4322;+.net+clr+2.0.50727)";
// Backing fields for the request settings exposed through the properties below.
private string _useragent = "googlebot/2.1 (+https://www.google.com/bot.html)";
private encoding _httpencoding = null;
private string _proxyhost = string.empty;
private int _proxyint = 8080;
private int _timeout = 200000;

#region Properties
/// <summary>
/// User-Agent value that get and post assign to their requests.
/// </summary>
public string useragent
{
    get { return _useragent; }
    set { _useragent = value; }
}

/// <summary>
/// Text encoding for HTTP content. When left null, get derives the
/// encoding from the response's character set; post also uses this
/// value to encode the data it sends.
/// </summary>
public encoding httpencoding
{
    get { return _httpencoding; }
    set { _httpencoding = value; }
}

/// <summary>
/// Proxy server host. get and post only configure a proxy when this
/// string is non-empty.
/// </summary>
public string proxyhost
{
    get { return _proxyhost; }
    set { _proxyhost = value; }
}

/// <summary>
/// Proxy server port, passed to webproxy together with proxyhost.
/// </summary>
public int proxyint
{
    get { return _proxyint; }
    set { _proxyint = value; }
}

/// <summary>
/// Default timeout that get and post assign to request.timeout.
/// </summary>
public int timeout
{
    get { return _timeout; }
    set { _timeout = value; }
}
#endregion
/// <summary>
/// Convenience overload: wraps the address string in a uri object and
/// hands it to get(uri).
/// </summary>
/// <param name="uri">Address of the page to download.</param>
/// <returns>Whatever get(uri) returns for the parsed address.</returns>
public remoteres get(string uri)
{
    uri target = new uri(uri);
    return get(target);
}
/// <summary>
/// Downloads the resource at the given address with an HTTP GET request
/// and decodes the body as text.
/// </summary>
/// <param name="uri">Address to request; its host is also used to build the Referer header.</param>
/// <returns>
/// A remoteres. On success html, contenttype and statuscode are filled
/// from the response. On failure statuscode holds the server's status
/// (serviceunavailable when there is no HTTP response,
/// internalservererror for non-web errors) and code holds the message.
/// </returns>
public remoteres get(uri uri)
{
    remoteres info = new remoteres();
    httpwebrequest request = (httpwebrequest)webrequest.create(uri);
    request.timeout = this._timeout;
    request.useragent = this._useragent;
    request.method = "get";
    request.referer = string.concat("https://", uri.host);
    if (this._proxyhost.length > 0)
    {
        request.proxy = new webproxy(this._proxyhost, this._proxyint);
    }
    httpwebresponse response = null;
    stream responsestream = null;
    try
    {
        response = (httpwebresponse)request.getresponse();
        responsestream = response.getresponsestream();

        // Only Content-Encoding describes how this body was encoded.
        // The previous code looked at an Accept-Encoding response header
        // first, which says nothing about the body and, when present,
        // prevented the Content-Encoding check from running at all.
        // NOTE(review): mycollections.contain is defined outside this file;
        // presumably it tests whether the header contains any listed token.
        string contentencoding = response.headers["content-encoding"];
        if (contentencoding != null
            && mycollections.contain(contentencoding, "*", "gzip", "x-gzip"))
        {
            responsestream = new gzipstream(responsestream, compressionmode.decompress);
        }

        encoding textencoding = this._httpencoding;
        if (textencoding == null)
        {
            // characterset may be null or empty when the server sends no
            // usable Content-Type; getencoding would throw on it, so the
            // UTF-8 fallback has to be decided before the lookup.
            string charset = response.characterset;
            string lowered = charset == null ? string.empty : charset.tolower();
            if (lowered.length == 0)
            {
                textencoding = encoding.utf8;
            }
            else if (lowered.length > 3 && lowered.substring(0, 3) == "iso")
            {
                textencoding = encoding.default;
            }
            else
            {
                try
                {
                    textencoding = encoding.getencoding(charset);
                }
                catch (argumentexception)
                {
                    // Unknown charset name announced by the server.
                    textencoding = encoding.utf8;
                }
            }
        }

        using (streamreader reader = new streamreader(responsestream, textencoding))
        {
            info.html = reader.readtoend();
        }
        info.contenttype = response.contenttype;
        info.statuscode = response.statuscode;
    }
    catch (webexception we)
    {
        httpwebresponse errorresponse = we.response as httpwebresponse;
        if (errorresponse != null)
        {
            info.statuscode = errorresponse.statuscode;
        }
        else
        {
            info.statuscode = httpstatuscode.serviceunavailable;
        }
        info.code = "错误:" + we.message;
        // The response attached to the exception holds the connection
        // until it is closed.
        if (we.response != null)
        {
            we.response.close();
        }
    }
    catch (exception ex)
    {
        info.code = "错误:" + ex.message;
        info.statuscode = httpstatuscode.internalservererror;
    }
    finally
    {
        if (responsestream != null)
            responsestream.close();
        if (response != null)
            response.close();
    }
    return info;
}
#region Fetch remote resource
/// <summary>
/// Downloads the raw bytes of a remote resource.
/// </summary>
/// <param name="strurl">Address of the resource to fetch.</param>
/// <returns>
/// A remoteres with bytes, contenttype and statuscode filled in, or
/// null when the request fails for any reason.
/// </returns>
public remoteres getremoteresource(string strurl)
{
    httpwebresponse response = null;
    stream stream = null;
    try
    {
        httpwebrequest request = (httpwebrequest)webrequest.create(strurl);
        request.allowautoredirect = true;
        request.useragent = "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; slcc1; .net clr 2.0.50727; .net clr 3.0.04506)";
        request.referer = "https://" + new uri(strurl).host;
        response = request.getresponse() as httpwebresponse;
        stream = response.getresponsestream();

        remoteres info = new remoteres();
        info.contenttype = response.contenttype;
        using (memorystream ms = new memorystream())
        {
            byte[] buffer = new byte[4096];
            int c;
            while ((c = stream.read(buffer, 0, buffer.length)) > 0)
            {
                ms.write(buffer, 0, c);
            }
            info.bytes = ms.toarray();
        }
        info.statuscode = response.statuscode;
        return info;
    }
    catch (webexception we)
    {
        // Failure is reported as null, so no status is recorded here.
        // The error response still has to be closed to release its
        // connection.
        if (we.response != null)
        {
            we.response.close();
        }
        return null;
    }
    catch
    {
        return null;
    }
    finally
    {
        if (stream != null)
            stream.close();
        if (response != null)
            response.close();
    }
}
#endregion
/// <summary>
/// Sends form data to an address with an HTTP POST request and decodes
/// the response body as text.
/// </summary>
/// <param name="strurl">Address to post to.</param>
/// <param name="postdata">Already url-encoded form data to send as the request body.</param>
/// <returns>
/// A remoteres. On success html holds the response text and statuscode
/// the response status. On failure html and code both hold the error
/// message and statuscode holds the server's status
/// (serviceunavailable when there is no HTTP response,
/// internalservererror for non-web errors).
/// </returns>
public remoteres post(string strurl, string postdata)
{
    remoteres info = new remoteres();
    stream responsestream = null;
    httpwebresponse response = null;
    try
    {
        // _httpencoding is null unless the caller set it; calling
        // getbytes on it directly made every such post fail with a null
        // reference. UTF-8 is used for the request body in that case.
        encoding requestencoding = this._httpencoding != null ? this._httpencoding : encoding.utf8;
        byte[] bytes = requestencoding.getbytes(postdata);

        httpwebrequest request = (httpwebrequest)webrequest.create(strurl);
        request.method = "post";
        request.contenttype = "application/x-www-form-urlencoded";
        request.contentlength = bytes.length;
        request.timeout = this._timeout;
        request.useragent = this._useragent;
        //request.referer = string.concat("https://", uri.host);
        if (this._proxyhost.length > 0)
        {
            request.proxy = new webproxy(this._proxyhost, this._proxyint);
        }
        using (stream requeststream = request.getrequeststream())
        {
            requeststream.write(bytes, 0, bytes.length);
        }

        response = (httpwebresponse)request.getresponse();
        responsestream = response.getresponsestream();

        encoding textencoding = this._httpencoding;
        if (textencoding == null)
        {
            // Decide the empty-charset fallback before calling getencoding,
            // which throws on an empty or unknown name.
            string charset = response.characterset;
            string lowered = charset == null ? string.empty : charset.tolower();
            if (lowered.length == 0)
            {
                textencoding = encoding.default;
            }
            else if (lowered.length > 3 && lowered.substring(0, 3) == "iso")
            {
                textencoding = encoding.default;
            }
            else
            {
                try
                {
                    textencoding = encoding.getencoding(charset);
                }
                catch (argumentexception)
                {
                    textencoding = encoding.default;
                }
            }
        }

        using (streamreader reader = new streamreader(responsestream, textencoding))
        {
            info.html = reader.readtoend();
        }
        info.statuscode = response.statuscode;
    }
    catch (webexception we)
    {
        httpwebresponse errorresponse = we.response as httpwebresponse;
        if (errorresponse != null)
        {
            info.statuscode = errorresponse.statuscode;
        }
        else
        {
            info.statuscode = httpstatuscode.serviceunavailable;
        }
        info.code = "错误:" + we.message;
        // Kept in html as well because this method has always reported
        // its errors there.
        info.html = "错误:" + we.message;
        if (we.response != null)
        {
            we.response.close();
        }
    }
    catch (exception ex)
    {
        info.statuscode = httpstatuscode.internalservererror;
        info.code = "错误:" + ex.message;
        info.html = "错误:" + ex.message;
    }
    finally
    {
        if (responsestream != null)
            responsestream.close();
        if (response != null)
            response.close();
    }
    return info;
}
#region Link checking
/// <summary>
/// Checks whether a link exists by sending a HEAD request to it.
/// </summary>
/// <param name="strurl">Address to probe.</param>
/// <returns>
/// true only when the server answers with status ok. Redirects are not
/// followed, and any web-level failure (error status, unreachable host,
/// timeout) yields false.
/// </returns>
public bool urlexist(string strurl)
{
    httpwebrequest request = (httpwebrequest)webrequest.create(strurl);
    request.method = "head";
    request.allowautoredirect = false;
    request.useragent = "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; slcc1; .net clr 2.0.50727; .net clr 3.0.04506; .net clr 3.5.21022; .net clr 1.0.3705; .net clr 1.1.4322)";
    httpwebresponse response = null;
    try
    {
        response = (httpwebresponse)request.getresponse();
        return response.statuscode == httpstatuscode.ok;
    }
    catch (webexception we)
    {
        // getresponse throws for error statuses and connection failures;
        // for this check those simply mean the link is not available.
        if (we.response != null)
        {
            we.response.close();
        }
        return false;
    }
    finally
    {
        // Close on every path, including success, so the connection is
        // released before the next link is probed.
        if (response != null)
            response.close();
    }
}

/// <summary>
/// Checks whether the number of dead links in a list stays within a
/// tolerated limit.
/// </summary>
/// <param name="urls">Addresses to probe.</param>
/// <param name="allowbadnum">Maximum number of dead links that is still acceptable.</param>
/// <returns>false as soon as more than allowbadnum links fail; otherwise true.</returns>
public bool urlexist(list<string> urls, int allowbadnum)
{
    // With no more links than the tolerated number of failures, the limit
    // cannot be exceeded, so nothing needs to be probed.
    if (urls.count <= allowbadnum)
    {
        return true;
    }
    int badcount = 0;
    foreach (string strurl in urls)
    {
        if (!urlexist(strurl))
        {
            badcount++;
            if (badcount > allowbadnum)
            {
                return false;
            }
        }
    }
    return true;
}
#endregion
}
/// <summary>
/// Result object filled in by the nethelper request methods.
/// </summary>
public class remoteres
{
    private httpstatuscode _statuscode;
    private string _contenttype;
    private byte[] _bytes;
    private string _html;
    private string _code;

    /// <summary>
    /// Message code of the result; nethelper.get stores its error text here.
    /// </summary>
    public string code
    {
        get { return _code; }
        set { _code = value; }
    }

    /// <summary>
    /// Text content of the result.
    /// </summary>
    public string html
    {
        get { return _html; }
        set { _html = value; }
    }

    /// <summary>
    /// Raw bytes of the remote resource.
    /// </summary>
    public byte[] bytes
    {
        get { return _bytes; }
        set { _bytes = value; }
    }

    /// <summary>
    /// Content type reported for the resource.
    /// </summary>
    public string contenttype
    {
        get { return _contenttype; }
        set { _contenttype = value; }
    }

    /// <summary>
    /// HTTP status code of the request.
    /// </summary>
    public httpstatuscode statuscode
    {
        get { return _statuscode; }
        set { _statuscode = value; }
    }
}
}
摘自 winner2050的专栏
上一篇: Oracle CDC数据增量测试代码实例
下一篇: Log4Net使用指南