三种 ASP.NET 抓取网页源码的实现(ASP.NET 采集、读取网页源码)
/// <summary>
/// Downloads a page with <see cref="HttpWebRequest"/> and returns its body as text.
/// </summary>
/// <remarks>
/// The body is read through a <see cref="StreamReader"/> created with
/// <see cref="Encoding.Default"/>. That constructor also looks for a byte order mark
/// at the start of the stream, so a page that carries a BOM is decoded with the
/// encoding the BOM announces; a page without one is decoded with
/// <see cref="Encoding.Default"/>.
/// Nothing is caught here: failures while creating the request, connecting or
/// reading propagate to the caller.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The source text of the page.</returns>
public static string gethtmlsource2(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Accept = "*/*"; // accept any content type
    request.UserAgent = "mozilla/4.0 (compatible; msie 6.0; windows nt 5.2; .net clr 1.1.4322)"; // present the request as coming from a browser
    request.AllowAutoRedirect = true; // follow redirect responses
    // request.CookieContainer = new CookieContainer(); // enable when the site requires cookies
    request.Referer = url; // the page names itself as the referrer

    // The response holds the underlying connection, so it (and the stream and
    // reader on top of it) must be released on every path, including when
    // ReadToEnd throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
写法2
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
namespace mysql
{
    /// <summary>
    /// Helper for downloading the text of a web resource.
    /// </summary>
    public class gethttpdata
    {
        /// <summary>
        /// Requests <paramref name="url"/> and returns the response body decoded as UTF-8.
        /// </summary>
        /// <remarks>
        /// A failure while obtaining the response is swallowed and reported as
        /// <c>null</c>. A failure while creating the request or while reading the
        /// body is not caught and propagates to the caller.
        /// </remarks>
        /// <param name="url">Address of the resource to request.</param>
        /// <returns>The response body, or <c>null</c> when no response could be obtained.</returns>
        public static string gethttpdata2(string url)
        {
            WebRequest request = WebRequest.Create(url);
            request.Timeout = 50000; // request timeout, in milliseconds

            WebResponse response;
            try
            {
                response = request.GetResponse();
            }
            catch (Exception)
            {
                // Best effort by design: the caller only learns that nothing was fetched.
                return null;
            }

            if (response == null)
            {
                return null;
            }

            // Release the response and the reader even when ReadToEnd throws.
            using (response)
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }
    }
}
写法3
/// <summary>
/// Downloads a page with <see cref="WebClient"/> and decodes it to text.
/// </summary>
/// <remarks>
/// Some pages cannot be fetched this way (for example when they require cookies);
/// such cases need extra request headers, e.g. <c>client.Headers.Add("Cookie", cookie)</c>,
/// or explicit credentials via <c>new NetworkCredential(user, password)</c>.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <param name="charsets">
/// Optional name of the page's encoding. It is used only when exactly one
/// non-empty name is passed. In every other case (no argument, <c>null</c>,
/// an empty name, or more than one name) the encoding is taken from the
/// <c>charset</c> declared in the page's <c>meta</c> tag, falling back to UTF-8
/// when the page declares none.
/// </param>
/// <returns>
/// The decoded page, or an empty string when the download fails or the
/// encoding name is not recognised.
/// </returns>
public static string gethtml(string url, params string[] charsets)
{
    try
    {
        // A literal null argument arrives as a null array, so guard before reading Length.
        string charset = null;
        if (charsets != null && charsets.Length == 1)
        {
            charset = charsets[0];
        }

        byte[] data;
        using (WebClient client = new WebClient())
        {
            // Authenticate with the credentials of the current security context.
            client.Credentials = CredentialCache.DefaultCredentials;
            data = client.DownloadData(url);
        }

        if (string.IsNullOrEmpty(charset))
        {
            // Provisional decode, only good enough to find the ASCII meta tag.
            string probe = Encoding.Default.GetString(data);

            // Accepts both <meta charset="utf-8"> and
            // <meta ... content="text/html; charset=utf-8">; the optional quote
            // is skipped so it never ends up inside the captured name.
            Match declared = Regex.Match(
                probe,
                "<meta[^<>]*charset\\s*=\\s*[\"']?([\\w.:-]+)",
                RegexOptions.IgnoreCase);
            if (declared.Success)
            {
                charset = declared.Groups[1].Value;
            }
        }

        // Always decode the raw bytes with the resolved encoding, including when
        // it happens to be the system default; UTF-8 is only the last resort.
        Encoding encoding = string.IsNullOrEmpty(charset)
            ? Encoding.GetEncoding("utf-8")
            : Encoding.GetEncoding(charset);
        return encoding.GetString(data);
    }
    catch (Exception)
    {
        // Best effort by design: any failure is reported as an empty page.
        return "";
    }
}