欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

asp.net 抓取网页源码三种实现方法

程序员文章站 2024-02-29 23:48:40
方法1(比较推荐):用 HttpWebRequest 取得网页源码……

方法1 比较推荐  

/// <summary>
/// Downloads the source of a web page with <see cref="HttpWebRequest"/>.
/// The body is decoded with <see cref="Encoding.Default"/>, except when the response
/// starts with a byte-order mark, in which case the reader switches to the encoding
/// announced by that mark.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded response body.</returns>
public static string gethtmlsource2(string url)
{
  HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
  request.Accept = "*/*"; // accept any content type
  request.UserAgent = "mozilla/4.0 (compatible; msie 6.0; windows nt 5.2; .net clr 1.1.4322)"; // present the request as coming from an IE browser
  request.AllowAutoRedirect = true; // follow HTTP redirects
  // Assign request.CookieContainer here when the target site requires cookies.
  request.Referer = url; // send the page's own address as the referer

  // Dispose the response, stream and reader even when reading fails, so the
  // underlying connection is always released.
  using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
  using (Stream stream = response.GetResponseStream())
  using (StreamReader reader = new StreamReader(stream, Encoding.Default))
  {
    return reader.ReadToEnd();
  }
}

方法2 

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;
namespace mysql
{
  /// <summary>
  /// Helper for downloading the text of a web resource.
  /// </summary>
  public class gethttpdata
  {
    /// <summary>
    /// Requests <paramref name="url"/> and returns the response body decoded as UTF-8.
    /// </summary>
    /// <param name="url">Address of the resource to download.</param>
    /// <returns>
    /// The response body, or <c>null</c> when the request does not produce a response
    /// (for example on a timeout or an HTTP error status).
    /// </returns>
    public static string gethttpdata2(string url)
    {
      WebRequest request = WebRequest.Create(url);
      request.Timeout = 50000; // milliseconds

      WebResponse response;
      try
      {
        response = request.GetResponse();
      }
      catch (Exception)
      {
        // Best effort by contract: a failed request yields null rather than an exception.
        return null;
      }

      // Read outside the try block so that read errors still surface to the caller,
      // and dispose both objects even when reading throws.
      using (response)
      using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
      {
        return reader.ReadToEnd();
      }
    }
  }
}

方法3

/// <summary>
/// Downloads a page with <see cref="WebClient"/> and decodes it with the most
/// suitable character set.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charsets">
/// Optional. Pass exactly one encoding name (for example "gb2312") to force that
/// encoding. When the argument is omitted, null or an empty string, the charset
/// declared in the page's meta tag is used, falling back to UTF-8 when the page
/// declares none or declares a name the runtime does not know.
/// </param>
/// <returns>
/// The decoded page text, or an empty string when the download fails or the
/// caller-supplied encoding name is not recognised.
/// </returns>
public static string gethtml(string url, params string[] charsets)
{
  try
  {
    // An explicit null argument arrives as a null array; treat it like "not specified".
    string charset = null;
    if (charsets != null && charsets.Length == 1)
    {
      charset = charsets[0];
    }

    byte[] pageBytes;
    using (WebClient client = new WebClient())
    {
      // Some pages cannot be fetched without extra headers such as cookies, e.g.
      // client.Headers.Add("Cookie", cookie);
      // Use new NetworkCredential(user, password) instead when the server
      // requires an explicit user name and password.
      client.Credentials = CredentialCache.DefaultCredentials;
      pageBytes = client.DownloadData(url);
    }

    if (!string.IsNullOrEmpty(charset))
    {
      // Caller-forced encoding: an unknown name throws and is reported as "" below.
      return Encoding.GetEncoding(charset).GetString(pageBytes);
    }

    // The meta tag consists of ASCII characters only, so a lossy ASCII decode is
    // enough to sniff the declared charset independently of the machine's code page.
    string probe = Encoding.ASCII.GetString(pageBytes);
    Match declared = Regex.Match(
      probe,
      "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:\\-]+)",
      RegexOptions.IgnoreCase);

    Encoding pageEncoding = Encoding.UTF8;
    if (declared.Success)
    {
      try
      {
        pageEncoding = Encoding.GetEncoding(declared.Groups[1].Value);
      }
      catch (ArgumentException)
      {
        // The page declares a charset this runtime does not know; keep the UTF-8 fallback.
      }
    }

    return pageEncoding.GetString(pageBytes);
  }
  catch (Exception)
  {
    // Best effort by contract: any failure is reported as an empty string.
    return "";
  }
}

asp.net 获取网页源文件的方法

有时候我们需要获取网页源文件,用下面这个方法就能很容易地完成任务!

/// <summary>
/// Downloads the resource at <paramref name="strurl"/> and returns its body decoded as GB2312.
/// </summary>
/// <param name="strurl">Address of the page to download.</param>
/// <returns>The decoded response body.</returns>
private string getstringbyurl(string strurl)
{
  WebRequest request = WebRequest.Create(strurl);

  // Dispose the response, stream and reader even when reading fails, so the
  // underlying connection is always released.
  using (WebResponse response = request.GetResponse())
  using (Stream body = response.GetResponseStream())
  using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
  {
    return reader.ReadToEnd();
  }
}

只要传入要下载网页的地址就可以了!
通过这个方法还可以实现源码导出:

/// <summary>
/// Sends the markup returned by <c>renderpage("default2.aspx")</c> to the client as a
/// UTF-8 encoded attachment named index.html.
/// </summary>
/// <returns>The markup that was written to the response.</returns>
private string savehtml()
{
  // renderpage is defined elsewhere; presumably it renders the page to a string (confirm).
  string html = renderpage("default2.aspx");

  System.Text.Encoding utf8 = System.Text.Encoding.GetEncoding("utf-8");
  Response.ContentEncoding = utf8; // emit UTF-8 so Chinese text is not garbled
  Response.AddHeader("content-disposition", "attachment;filename=index.html"); // offer the output as a file download
  // Content-Length counts bytes, not UTF-16 characters; using the string length
  // would truncate any page that contains non-ASCII text.
  Response.AddHeader("content-length", utf8.GetByteCount(html).ToString());
  Response.Write(html);
  Response.End();

  // Required by the declared return type; Response.End normally ends the request
  // before this line is reached.
  return html;
}

以上就是asp.net 抓取网页源码的全部代码了,希望对大家有所帮助。