C#多线程爬虫抓取免费代理IP的示例代码

程序员文章站 2023-12-09 17:27:03

这里用到一个 HTML 解析辅助类库：HtmlAgilityPack。如果本地没有，可以从网上下载后添加到项目引用中。这个类库有很多版本：如果你的开发环境是 VS2005，就使用 2.0 的类库，VS201...

这里用到一个 HTML 解析辅助类库：HtmlAgilityPack。如果本地没有，可以从网上下载后添加到项目引用中。这个类库有很多版本：如果你的开发环境是 VS2005，就使用 2.0 的类库；如果是 VS2010，就使用 4.0 的类库，以此类推。然后直接创建一个控制台应用，用下面的代码替换默认生成的代码就可以运行。下面顺便讲讲我两年前做爬虫的经历：当时是给一家公司做，用的也是 C#，不过遇到一个头痛的问题——抓到的图片带有病毒，导致系统挂了几次。所以抓取网站图片时要注意安全；虽然本文没有涉及图片，但还是提醒一下看文章的朋友。

 /// <summary>
 /// Console crawler that downloads 15 listing pages from xicidaili.com in
 /// parallel on the thread pool and collects every proxy whose speed bar
 /// exceeds 90%.
 /// </summary>
 class program
  {
    // Stores every proxy that passed the speed filter. It is written from
    // several thread-pool workers at once, so all access inside this class
    // goes through masterporxylistlock.
    public static List<proxy> masterporxylist = new List<proxy>();

    // Guards masterporxylist; List<T> is not safe for concurrent writers.
    private static readonly object masterporxylistlock = new object();

    // A proxy is kept only when its speed-bar percentage is above this value.
    private const int minspeedpercent = 90;

    /// <summary>One crawled proxy entry: address, port and speed-bar percentage.</summary>
    public class proxy
    {
      public string ip;
      public string port;
      public int speed;

      public proxy(string pip, string pport, int pspeed)
      {
        this.ip = pip;
        this.port = pport;
        this.speed = pspeed;
      }
    }

    /// <summary>
    /// Thread-pool work item: downloads one listing page, parses the proxy
    /// table and appends every sufficiently fast proxy to masterporxylist.
    /// </summary>
    /// <param name="pageindex">Page number; its ToString() is appended to the listing URL.</param>
    static void getproxylist(object pageindex)
    {
      string url = "http://www.xicidaili.com/wt/" + pageindex.ToString();
      string html = catchproxipmethord(url, "utf8");
      if (string.IsNullOrEmpty(html))
      {
        // Download failed (already reported); nothing to parse.
        return;
      }

      HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
      doc.LoadHtml(html);

      HtmlNode table = doc.DocumentNode.SelectSingleNode("//div[@id='wrapper']//div[@id='body']/table[1]");
      if (table == null)
      {
        // Page layout differs from what is expected (error page, block page, ...).
        return;
      }

      // SelectNodes yields null rather than an empty collection when nothing matches.
      HtmlNodeCollection rows = table.SelectNodes("./tr");
      if (rows == null)
      {
        return;
      }

      // The first row of the table is the header, so start from the second row.
      for (int i = 1; i < rows.Count; i++)
      {
        // ChildNodes is indexed at 3 (ip), 5 (port) and 13 (speed bar) —
        // presumably because whitespace text nodes sit between the <td>s;
        // verify against the live markup.
        HtmlNodeCollection cells = rows[i].ChildNodes;
        if (cells.Count <= 13)
        {
          continue;
        }

        string ip = cells[3].InnerText.Trim();
        string port = cells[5].InnerText.Trim();

        int speed;
        if (!tryparsespeed(cells[13].InnerHtml, out speed))
        {
          continue;
        }

        // A speed-bar value above 90 means the proxy is fast.
        if (speed > minspeedpercent)
        {
          int total;
          lock (masterporxylistlock)
          {
            masterporxylist.Add(new proxy(ip, port, speed));
            total = masterporxylist.Count;
          }
          Console.WriteLine("当前是第:" + total.ToString() + "个代理ip");
        }
      }
    }

    /// <summary>
    /// Extracts the integer between the first ':' and the first '%' of the
    /// speed cell markup (e.g. "width:95%" gives 95).
    /// </summary>
    /// <returns>false when the markers are missing, misordered, or the text between them is not an integer.</returns>
    private static bool tryparsespeed(string speedhtml, out int speed)
    {
      speed = 0;
      if (string.IsNullOrEmpty(speedhtml))
      {
        return false;
      }

      int colon = speedhtml.IndexOf(':');
      int percent = speedhtml.IndexOf('%');
      if (colon < 0 || percent <= colon)
      {
        return false;
      }

      string digits = speedhtml.Substring(colon + 1, percent - colon - 1).Trim();
      return int.TryParse(digits, out speed);
    }

    /// <summary>
    /// Downloads a page and returns its body as text.
    /// </summary>
    /// <param name="url">Address to fetch; null or empty yields "".</param>
    /// <param name="encoding">"utf8" selects UTF-8; any other value selects Encoding.Default.</param>
    /// <returns>The response body, or "" when the URL is empty or the request fails.</returns>
    static string catchproxipmethord(string url, string encoding)
    {
      if (string.IsNullOrEmpty(url))
      {
        return "";
      }

      Encoding ec = encoding == "utf8" ? Encoding.UTF8 : Encoding.Default;

      try
      {
        WebRequest request = WebRequest.Create(url);
        // using-blocks release the response, stream and reader even when
        // ReadToEnd throws part-way through.
        using (WebResponse response = request.GetResponse())
        using (Stream datastream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(datastream, ec))
        {
          return reader.ReadToEnd();
        }
      }
      catch (Exception ex)
      {
        // Best-effort crawl: a failed page must not take down the worker
        // thread, but the failure is reported instead of silently dropped.
        Console.Error.WriteLine("Failed to fetch " + url + ": " + ex.Message);
        return "";
      }
    }

    /// <summary>
    /// Entry point: queues pages 1..15 on the thread pool, then blocks on
    /// console input so the process stays alive while the workers run.
    /// </summary>
    static void Main(string[] args)
    {
      // Crawl 15 pages concurrently.
      for (int page = 1; page <= 15; page++)
      {
        ThreadPool.QueueUserWorkItem(getproxylist, page);
      }
      Console.Read();
    }

  }

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持。

上一篇：索尼PS4/PS Vita游戏机正式入华 PS4售2899元

C#多线程爬虫抓取免费代理IP的示例代码

尝试使用Python多线程抓取代理服务器IP地址的示例

尝试使用Python多线程抓取代理服务器IP地址的示例

Python之多线程爬虫抓取网页图片的示例代码

尝试使用Python多线程抓取代理服务器IP地址的示例

Python之多线程爬虫抓取网页图片的示例代码

尝试使用Python多线程抓取代理服务器IP地址的示例

python爬虫批量抓取ip代理的方法（代码）

python爬虫批量抓取ip代理的方法（代码）