ASP.NET 抓取网页源码的三种实现方法
程序员文章站
2024-02-29 23:48:40
方法1 比较推荐
///
/// 用httpwebrequest取得网页源码...
方法1 比较推荐
/// <summary>
/// Downloads the page at <paramref name="url"/> with <see cref="HttpWebRequest"/> and returns its source text.
/// The body is decoded with <see cref="Encoding.Default"/>; the StreamReader constructor used here also
/// checks for a byte-order mark, so BOM-prefixed pages are decoded with the encoding their BOM announces.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The page source.</returns>
/// <exception cref="WebException">Propagated unchanged when the request fails.</exception>
public static string gethtmlsource2(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Accept = "*/*"; // accept any content type
    // Pretend to be a desktop browser; some sites reject requests without a user agent.
    request.UserAgent = "mozilla/4.0 (compatible; msie 6.0; windows nt 5.2; .net clr 1.1.4322)";
    request.AllowAutoRedirect = true; // follow 302 redirects
    request.Referer = url; // present the page itself as the referrer

    // Dispose the response, stream and reader even when reading throws, so the
    // underlying connection is always released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
方法2
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;

namespace mysql
{
    public class gethttpdata
    {
        /// <summary>
        /// Downloads <paramref name="url"/> with a 50 second timeout and returns the body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Address of the resource to download.</param>
        /// <returns>
        /// The response body, or <c>null</c> when obtaining the response failed. The failure is
        /// written to the trace output instead of being dropped.
        /// </returns>
        /// <remarks>
        /// Only failures while obtaining the response are converted to <c>null</c>; an error raised
        /// while reading the body still propagates to the caller.
        /// </remarks>
        public static string gethttpdata2(string url)
        {
            WebRequest request = WebRequest.Create(url);
            request.Timeout = 50000; // milliseconds

            WebResponse response;
            try
            {
                response = request.GetResponse();
            }
            catch (WebException e)
            {
                Trace.TraceWarning("gethttpdata2: request to {0} failed: {1}", url, e.Message);
                return null;
            }
            catch (Exception e)
            {
                Trace.TraceWarning("gethttpdata2: request to {0} failed: {1}", url, e);
                return null;
            }

            // Dispose both the response and the reader even if reading the body throws.
            using (response)
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }
    }
}
方法3
/// <summary>
/// Downloads a page with <see cref="WebClient"/> and decodes it with the right character set.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charsets">
/// Optional. When exactly one non-empty name is supplied it is used as the page encoding.
/// When omitted, <c>null</c> or empty, the charset is read from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <returns>
/// The decoded page source, or an empty string when the download fails. UTF-8 is used when no
/// charset is given or declared, or when the charset name is not recognised.
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way (for example, pages that require cookies); those need
/// extra request headers, which this method does not set.
/// </remarks>
public static string gethtml(string url, params string[] charsets)
{
    try
    {
        // Passing a literal null for a params array makes the array itself null.
        string charset = null;
        if (charsets != null && charsets.Length == 1)
        {
            charset = charsets[0];
        }

        byte[] pageBytes;
        using (WebClient client = new WebClient())
        {
            // Use the credentials of the current security context for authenticated resources.
            client.Credentials = CredentialCache.DefaultCredentials;
            pageBytes = client.DownloadData(url);
        }

        if (string.IsNullOrEmpty(charset))
        {
            // Charset names are plain ASCII, so an ASCII view of the bytes is enough to find the declaration.
            string sniffText = Encoding.ASCII.GetString(pageBytes);

            // Matches both <meta charset="utf-8"> and <meta ... content="text/html; charset=utf-8">,
            // capturing only the charset token (no quotes, no trailing attributes).
            Match charsetMatch = Regex.Match(
                sniffText,
                "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:-]+)",
                RegexOptions.IgnoreCase);
            if (charsetMatch.Success)
            {
                charset = charsetMatch.Groups[1].Value;
            }
        }

        Encoding pageEncoding = Encoding.UTF8;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                pageEncoding = Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Unknown or misspelled charset name: keep the UTF-8 fallback rather than returning nothing.
            }
        }

        return pageEncoding.GetString(pageBytes);
    }
    catch (Exception e)
    {
        // Best-effort contract: callers get "" on failure, but the cause is traced instead of dropped.
        System.Diagnostics.Trace.TraceWarning("gethtml: failed to download {0}: {1}", url, e.Message);
        return "";
    }
}
ASP.NET 获取网页源文件的方法
有时候我们需要获取网页源文件,用下面这个方法就可以很容易地完成任务!
/// <summary>
/// Downloads the page at <paramref name="strurl"/> and returns its body decoded as GB2312.
/// </summary>
/// <param name="strurl">Address of the page to download.</param>
/// <returns>The page source.</returns>
private string getstringbyurl(string strurl)
{
    WebRequest request = WebRequest.Create(strurl);

    // Dispose the response, stream and reader so the connection is released
    // even when reading fails part-way through.
    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
    {
        return reader.ReadToEnd();
    }
}
只要传入要下载网页的地址就 OK 了!
通过这个方法做个源码导出:
/// <summary>
/// Renders "default2.aspx" to a string and sends it to the client as a downloadable
/// file named index.html.
/// </summary>
/// <returns>The rendered markup that was written to the response.</returns>
private string savehtml()
{
    // NOTE(review): renderpage is defined elsewhere; presumably it returns the rendered page markup - confirm.
    string str = renderpage("default2.aspx");

    // Emit UTF-8 so non-ASCII (e.g. Chinese) content is not garbled.
    System.Text.Encoding outputEncoding = System.Text.Encoding.GetEncoding("utf-8");
    Response.ContentEncoding = outputEncoding;

    // Make the browser save the output as a file instead of displaying it.
    Response.AddHeader("content-disposition", "attachment;filename=index.html");

    // Content-Length counts bytes, not characters; the character count under-reports
    // any multi-byte text and truncates the download.
    Response.AddHeader("content-length", outputEncoding.GetByteCount(str).ToString());

    Response.Write(str);
    Response.End();

    // Response.End() normally ends the request; the return gives the declared string
    // result a value on every code path.
    return str;
}
以上就是 ASP.NET 抓取网页源码的全部代码了,希望对大家有所帮助。