C#制作多线程处理强化版网络爬虫
程序员文章站
2022-03-26 08:36:25
上次帮公司妹子做了一个爬虫,不是很精致。这次公司项目里要用到,于是又做了一番修改,新增了网址图片采集、下载,以及多线程处理界面网址图片下载等功能。
说说思路:首先获取初始网...
上次帮公司妹子做了一个爬虫,不是很精致。这次公司项目里要用到,于是又做了一番修改,新增了网址图片采集、下载,以及多线程处理界面网址图片下载等功能。
说说思路:首先获取初始网址的所有内容,在初始网址采集图片,再到初始网址采集链接,把采集到的链接放入队列;然后对队列中的链接继续采集图片、继续采集链接,如此无限循环。
还是上图片大家看一下,
网页内容抓取和网页网址爬取的处理都做了改进,下面大家来看看代码,有不足之处,还请指出!
网页内容抓取htmlcoderequest,
网页网址爬取gethttplinks,用正则去筛选html中的links
图片抓取gethtmlimageurllist,用正则去筛选html中的img
都写进了一个封装类里面 httphelper
/// <summary>
/// Downloads the page at <paramref name="url"/> with an HTTP GET and returns its body as text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The response body decoded as UTF-8. An empty string is returned when
/// <paramref name="url"/> is null or empty, or when the request fails for any reason.
/// </returns>
public static string htmlcoderequest(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }

    try
    {
        // Process-wide limit. It only affects ServicePoint objects created after the
        // assignment, so it has to be set before the request (and its ServicePoint) exists.
        ServicePointManager.DefaultConnectionLimit = 30;

        HttpWebRequest httprequst = (HttpWebRequest)WebRequest.Create(url);
        // NOTE(review): the original comment said "do not keep a persistent connection"
        // while the code enabled keep-alive; the code's value is preserved here.
        httprequst.KeepAlive = true;
        // HTTP method tokens are case-sensitive on the wire.
        httprequst.Method = "GET";
        // Header value only: the property adds the "User-Agent:" name itself.
        httprequst.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        httprequst.Accept = "*/*";
        httprequst.Headers.Add("accept-language", "zh-cn,en-us;q=0.5");
        httprequst.ServicePoint.Expect100Continue = false;
        httprequst.Timeout = 5000;
        // Follow 302-style redirects automatically.
        httprequst.AllowAutoRedirect = true;

        // Disposing the response releases the underlying connection even when reading fails.
        using (HttpWebResponse webres = (HttpWebResponse)httprequst.GetResponse())
        using (System.IO.Stream stream = webres.GetResponseStream())
        using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best-effort crawl: any failure (bad URL, timeout, non-HTTP scheme, ...) yields no content.
        return "";
    }
}

/// <summary>
/// Fetches the page at <paramref name="url"/> and collects the <c>src</c> value of every
/// <c>img</c> tag found in its HTML.
/// </summary>
/// <param name="url">Address of the page to scan for images.</param>
/// <returns>
/// The image addresses in document order, exactly as written in the page (they may be
/// relative and may repeat). Empty when the page could not be fetched.
/// </returns>
public static List<string> gethtmlimageurllist(string url)
{
    string html = httphelper.htmlcoderequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }

    // Matches an img tag and captures its src attribute value in the "imgurl" group.
    Regex regimg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgurl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    MatchCollection matches = regimg.Matches(html);

    List<string> surllist = new List<string>();
    foreach (Match match in matches)
    {
        surllist.Add(match.Groups["imgurl"].Value);
    }
    return surllist;
}

/// <summary>
/// Fetches the page at <paramref name="url"/> and extracts the links it contains: first
/// every absolute http(s) address appearing anywhere in the HTML, then the <c>href</c>
/// targets of anchor tags.
/// </summary>
/// <param name="url">Address of the page to scan for links.</param>
/// <returns>
/// The distinct links that passed the <c>stringhelper</c> filters, in discovery order.
/// Empty when the page could not be fetched.
/// </returns>
public static List<string> gethttplinks(string url)
{
    string html = httphelper.htmlcoderequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }

    List<string> links = new List<string>();
    // Tracks what is already in 'links' so de-duplication does not rescan the list.
    HashSet<string> seen = new HashSet<string>();

    // Pass 1: absolute http/https addresses anywhere in the markup.
    const string pattern2 = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r2 = new Regex(pattern2, RegexOptions.IgnoreCase);
    foreach (Match url2 in r2.Matches(html))
    {
        string candidate = url2.ToString();
        // NOTE(review): a true result from checkurlislegal causes the URL to be skipped;
        // the helper is not visible here, so confirm that this polarity is intended.
        if (stringhelper.checkurlislegal(candidate) || !stringhelper.ispureurl(candidate) || !seen.Add(candidate))
        {
            continue;
        }
        links.Add(candidate);
    }

    // Pass 2: href targets of anchor tags, skipping javascript: and postback pseudo-links.
    const string pattern = @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__dopostback)(?<url>[^'""\s*#<>]+)[^>]*>";
    Regex r = new Regex(pattern, RegexOptions.IgnoreCase);
    foreach (Match url1 in r.Matches(html))
    {
        string href1 = url1.Groups["url"].Value;
        // Only a leading "http" marks an absolute link; a relative href that merely
        // contains "http" (e.g. in its query string) still needs the site prefix.
        if (!href1.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            href1 = global.weburl + href1;
        }
        if (!stringhelper.ispureurl(href1) || !seen.Add(href1))
        {
            continue;
        }
        links.Add(href1);
    }

    return links;
}
这边下载图片有个任务条数限制,上限是200条;如果超过,线程会等待5秒。这里下载图片是通过委托异步调用的。
/// <summary>
/// Downloads the image at <paramref name="url"/> into the folder named by
/// <c>global.floderurl</c>, storing it under a freshly generated GUID file name.
/// </summary>
/// <param name="url">
/// Image address. A value that does not start with "http" is treated as relative and is
/// prefixed with <c>global.weburl</c>.
/// </param>
/// <returns>
/// The saved file name (GUID plus the extension taken from the URL path, if any) on
/// success; otherwise a message beginning with "错误:".
/// </returns>
public string downloadimg(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "错误:地址为空";
    }

    try
    {
        if (!url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            url = global.weburl + url;
        }

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 2000;
        // Header value only: the property adds the "User-Agent:" name itself.
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        // Follow 302-style redirects automatically.
        request.AllowAutoRedirect = true;

        string afirstname = Guid.NewGuid().ToString();
        // Take the extension from the URL path only, so query strings, fragments and dots
        // in the host name never end up in the file name. Includes the leading "." or is empty.
        string extension = Path.GetExtension(request.RequestUri.AbsolutePath);
        string filename = afirstname + extension;

        // 'using' closes the response, network stream and file even if the copy fails midway.
        using (WebResponse response = request.GetResponse())
        using (Stream reader = response.GetResponseStream())
        using (FileStream writer = new FileStream(global.floderurl + filename, FileMode.Create, FileAccess.Write))
        {
            byte[] buff = new byte[8192];
            int c;
            while ((c = reader.Read(buff, 0, buff.Length)) > 0)
            {
                writer.Write(buff, 0, c);
            }
        }

        return filename;
    }
    catch (Exception)
    {
        // Best-effort download: report the failing address instead of throwing.
        return "错误:地址" + url;
    }
}
话不多说,更多的需要大家自己去改进咯!欢迎读者来与楼主进行交流。