基于C#实现网页爬虫

程序员文章站 2022-11-23 23:41:23

本文实例为大家分享了基于c#实现网页爬虫的详细代码，供大家参考，具体内容如下 http请求工具类：功能： 1、获取网页html 2、下载网络图片 usi...

本文实例为大家分享了基于C#实现网页爬虫的详细代码，供大家参考，具体内容如下：

http请求工具类：

功能：

1、获取网页html

2、下载网络图片

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace utils
{
  /// <summary>
  /// HTTP request helper: fetches a page's HTML and downloads files (images) to disk.
  /// </summary>
  /// <remarks>
  /// The namespace, type and method names are kept exactly as callers reference them;
  /// only framework identifiers use their real .NET casing, which is required to compile.
  /// </remarks>
  public class httprequestutil
  {
    // User-Agent header sent with every request (value kept verbatim).
    private const string UserAgent = "mozilla/4.0 (compatible; msie 8.0; windows nt 6.0; trident/4.0)";

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> with an HTTP GET and returns its body
    /// decoded as UTF-8.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the page.</param>
    /// <returns>The response body as a string.</returns>
    /// <exception cref="WebException">The request failed or returned an error status.</exception>
    /// <exception cref="InvalidCastException"><paramref name="url"/> is not an http/https URL.</exception>
    public static string getpagehtml(string url)
    {
      // A direct cast fails loudly for non-HTTP schemes instead of yielding null
      // and a later NullReferenceException.
      HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
      request.UserAgent = UserAgent;

      // GetResponse() is the point where the GET request is actually sent.
      // Disposing the response returns the connection to the pool; without it the
      // per-host connection limit is exhausted after a few calls and requests hang.
      using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
      using (Stream responseStream = response.GetResponseStream())
      using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
      {
        return reader.ReadToEnd();
      }
    }

    /// <summary>
    /// Downloads <paramref name="url"/> into the "download" folder under the application's
    /// startup path, named after the last path segment of the URL. Does nothing when that
    /// file already exists.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the file.</param>
    /// <exception cref="ArgumentException">The URL path does not end in a file name.</exception>
    /// <exception cref="WebException">The request failed or returned an error status.</exception>
    /// <exception cref="IOException">The target file could not be written.</exception>
    public static void httpdownloadfile(string url)
    {
      // Use only the path component so a query string or fragment never leaks into the file name.
      string fileName = Path.GetFileName(new Uri(url).AbsolutePath);
      if (string.IsNullOrEmpty(fileName))
      {
        throw new ArgumentException("URL does not end in a file name: " + url, "url");
      }

      string directory = Path.Combine(Application.StartupPath, "download");
      Directory.CreateDirectory(directory); // no-op when the directory already exists
      string filePathName = Path.Combine(directory, fileName);
      if (File.Exists(filePathName)) return;

      HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
      request.UserAgent = UserAgent;
      request.Proxy = null;

      // Write to a temporary ".part" file and rename on success, so an interrupted download
      // never leaves a truncated file that the existence check above would treat as complete.
      string tempPathName = filePathName + ".part";
      bool tempCreated = false;
      try
      {
        // GetResponse() is the point where the GET request is actually sent.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (FileStream output = new FileStream(tempPathName, FileMode.Create))
        {
          tempCreated = true;
          responseStream.CopyTo(output);
        }
        File.Move(tempPathName, filePathName);
      }
      catch
      {
        // Only remove the temp file if this call created it; another thread may own it otherwise.
        if (tempCreated)
        {
          try
          {
            File.Delete(tempPathName);
          }
          catch (IOException)
          {
            // Best-effort cleanup; the original failure is the one worth reporting.
          }
          catch (UnauthorizedAccessException)
          {
            // Best-effort cleanup; the original failure is the one worth reporting.
          }
        }
        throw;
      }
    }
  }
}

多线程爬取网页代码：

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using utils;

namespace 爬虫
{
  /// <summary>
  /// Crawler window: button1 starts 10 worker threads that each crawl 10 list pages,
  /// follow the profile links found on them and download the matching images;
  /// button2 stops, button3 pauses and button4 resumes the workers.
  /// </summary>
  /// <remarks>
  /// Names defined outside this listing (initializecomponent, the controls, the event handler
  /// names wired by the designer, utils.httprequestutil) are referenced exactly as they appear
  /// in the listing. NOTE(review): confirm their casing against the designer file.
  /// </remarks>
  public partial class form1 : Form
  {
    private const int ThreadCount = 10;
    private const int PagesPerThread = 10;

    // Built once instead of once per crawled page; patterns kept verbatim.
    private static readonly Regex ProfileLinkRegex = new Regex("<a[\\s]+class=\"j-userpic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");
    private static readonly Regex ImageRegex = new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

    List<Thread> threadlist = new List<Thread>();
    Thread thread = null;

    // Signaled = workers may run; reset = workers block at their next checkpoint (pause).
    private readonly ManualResetEventSlim _pauseGate = new ManualResetEventSlim(true);

    // Cancellation source of the current crawl run; cancelled by "stop" and on form closing.
    private CancellationTokenSource _stopSource;

    public form1()
    {
      initializecomponent();
    }

    /// <summary>Start: resets the counters and launches the worker threads.</summary>
    private void button1_click(object sender, EventArgs e)
    {
      DateTime dtstart = DateTime.Now;
      button3.Enabled = true;
      button2.Enabled = true;
      button1.Enabled = false;
      // Counters shared by all workers of this run; only modified through Interlocked.
      int page = 0;
      int count = 0;
      int personcount = 0;
      lblpage.Text = "已完成页数：0";

      threadlist.Clear();
      _pauseGate.Set();
      _stopSource = new CancellationTokenSource();
      CancellationToken token = _stopSource.Token;

      for (int i = 1; i <= ThreadCount; i++)
      {
        thread = new Thread(new ParameterizedThreadStart(delegate(object obj)
        {
          int group = Convert.ToInt32(obj);
          for (int j = 1; j <= PagesPerThread; j++)
          {
            if (!WaitWhilePaused(token)) return;

            // Must be a per-thread local: a variable shared between the workers gets
            // overwritten by another thread before the URL is built.
            int index = (group - 1) * PagesPerThread + j;
            try
            {
              string pagehtml = httprequestutil.getpagehtml("http://tt.mop.com/c44/0/1_" + index.ToString() + ".html");
              foreach (Match match in ProfileLinkRegex.Matches(pagehtml))
              {
                // Group 2 is the href value captured by the pattern.
                string url = match.Groups[2].Value;
                if (!url.StartsWith("/", StringComparison.Ordinal)) continue;
                if (!WaitWhilePaused(token)) return;

                string imgpagehtml = httprequestutil.getpagehtml("http://tt.mop.com" + url);
                Interlocked.Increment(ref personcount);
                UpdateUi(token, () => lblperson.Text = "已完成条数：" + personcount.ToString());

                foreach (Match matchimgpage in ImageRegex.Matches(imgpagehtml))
                {
                  // Group 1 is the src value captured by the pattern.
                  string imgurl = matchimgpage.Groups[1].Value;
                  if (!imgurl.StartsWith("http://i1", StringComparison.Ordinal)) continue;
                  if (!WaitWhilePaused(token)) return;

                  try
                  {
                    httprequestutil.httpdownloadfile(imgurl);
                    Interlocked.Increment(ref count);
                    UpdateUi(token, () =>
                    {
                      lblnum.Text = "已下载图片数" + count.ToString();
                      double time = DateTime.Now.Subtract(dtstart).TotalSeconds;
                      if (time > 0)
                      {
                        lblspeed.Text = "速度：" + (count / time).ToString("0.0") + "张/秒";
                      }
                    });
                  }
                  catch (Exception ex)
                  {
                    // Best effort: one failed image must not stop the crawl, but leave a trace.
                    System.Diagnostics.Trace.WriteLine("Download failed for " + imgurl + ": " + ex.Message);
                  }
                  Thread.Sleep(1);
                }
              }
            }
            catch (Exception ex)
            {
              // Best effort: a failed page is skipped but still counted as processed below.
              System.Diagnostics.Trace.WriteLine("Page " + index.ToString() + " failed: " + ex.Message);
            }

            // The value returned by Increment is unique per call, so exactly one worker
            // observes the final page and reports completion.
            int finished = Interlocked.Increment(ref page);
            UpdateUi(token, () => lblpage.Text = "已完成页数：" + page.ToString());

            if (finished == ThreadCount * PagesPerThread)
            {
              UpdateUi(token, () =>
              {
                button1.Enabled = true;
                MessageBox.Show(this, "完成！");
              });
            }
          }
        }));
        // Background threads cannot keep the process alive after the window is closed.
        thread.IsBackground = true;
        thread.Start(i);
        threadlist.Add(thread);
      }
    }

    /// <summary>Stop: asks all workers to finish and resets the buttons.</summary>
    private void button2_click(object sender, EventArgs e)
    {
      RequestStop();
      button1.Enabled = true;
      button2.Enabled = false;
      button3.Enabled = false;
      button4.Enabled = false;
    }

    /// <summary>Asks the workers to stop when the window is closing.</summary>
    private void form1_formclosing(object sender, FormClosingEventArgs e)
    {
      RequestStop();
    }

    /// <summary>Pause: workers block at their next checkpoint.</summary>
    private void button3_click(object sender, EventArgs e)
    {
      _pauseGate.Reset();
      button3.Enabled = false;
      button4.Enabled = true;
    }

    /// <summary>Resume: releases workers blocked by a pause.</summary>
    private void button4_click(object sender, EventArgs e)
    {
      _pauseGate.Set();
      button3.Enabled = true;
      button4.Enabled = false;
    }

    /// <summary>
    /// Cancels the current run. Workers exit at their next checkpoint; a worker that is in the
    /// middle of a network call finishes that call first. Cancelling also wakes paused workers.
    /// </summary>
    private void RequestStop()
    {
      CancellationTokenSource source = _stopSource;
      if (source != null)
      {
        source.Cancel();
      }
    }

    /// <summary>
    /// Worker checkpoint: blocks while the crawl is paused.
    /// </summary>
    /// <returns>true to continue working; false when the run has been cancelled.</returns>
    private bool WaitWhilePaused(CancellationToken token)
    {
      try
      {
        _pauseGate.Wait(token);
        return true;
      }
      catch (OperationCanceledException)
      {
        return false;
      }
    }

    /// <summary>
    /// Runs <paramref name="action"/> on the UI thread, unless the run was cancelled or the
    /// form is already gone.
    /// </summary>
    private void UpdateUi(CancellationToken token, Action action)
    {
      if (token.IsCancellationRequested) return;
      try
      {
        if (IsDisposed || !IsHandleCreated) return;
        Invoke(action);
      }
      catch (InvalidOperationException)
      {
        // The form was closed between the check and the call (ObjectDisposedException is a
        // subclass of InvalidOperationException); there is no UI left to update.
      }
    }
  }
}

截图：

基于C#实现网页爬虫

以上就是本文的全部内容，希望对大家的学习有所帮助。

上一篇： jquery中为什么能用$操作

下一篇： C#6.0中10大新特性的应用和总结

基于C#实现网页爬虫

C#使用WebClient登录网站并抓取登录后的网页信息实现方法

C#实现基于Base64的加密解密类实例

C#基于正则表达式实现获取网页中所有信息的网页抓取类实例

C#基于SQLiteHelper类似SqlHelper类实现存取Sqlite数据库的方法

C#实现基于XML配置MenuStrip菜单的方法

python基于BeautifulSoup实现抓取网页指定内容的方法

C#实现的基于二进制读写文件操作示例

C#实现的滚动网页截图功能示例

C#实现Winform中打开网页页面的方法

基于python实现多线程网页爬虫