基于C#实现网页爬虫
程序员文章站
2022-11-23 23:41:23
本文实例为大家分享了基于 C# 实现网页爬虫的详细代码,供大家参考,具体内容如下
HTTP 请求工具类:
功能:
1、获取网页 HTML
2、下载网络图片
usi...
本文实例为大家分享了基于 C# 实现网页爬虫的详细代码,供大家参考,具体内容如下
HTTP 请求工具类:
功能:
1、获取网页 HTML
2、下载网络图片
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace utils
{
    /// <summary>
    /// HTTP request helper: fetches a page as text and downloads a remote file to disk.
    /// </summary>
    /// <remarks>
    /// The namespace, class and method names are kept exactly as they are spelled in this file
    /// so that existing call sites (e.g. <c>httprequestutil.getpagehtml(...)</c>) keep resolving.
    /// </remarks>
    public class httprequestutil
    {
        /// <summary>User-Agent header sent with every request.</summary>
        private const string UserAgent = "mozilla/4.0 (compatible; msie 8.0; windows nt 6.0; trident/4.0)";

        /// <summary>
        /// Downloads the document at <paramref name="url"/> and returns its body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute HTTP/HTTPS URL of the page.</param>
        /// <returns>The response body as a string.</returns>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        public static string getpagehtml(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = UserAgent;

            // Nothing is sent until GetResponse() is called; the request method defaults to GET.
            // The response, its stream and the reader are all disposed here: an undisposed
            // HttpWebResponse keeps its connection checked out of the per-host connection pool.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the file at <paramref name="url"/> into the "download" folder below the
        /// application's start-up directory. The local file name is the part of the URL after
        /// the last '/'. If a file with that name already exists, nothing is downloaded.
        /// </summary>
        /// <param name="url">Absolute HTTP/HTTPS URL of the file.</param>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        /// <exception cref="IOException">The local file could not be created or written.</exception>
        public static void httpdownloadfile(string url)
        {
            int pos = url.LastIndexOf('/') + 1;
            string fileName = url.Substring(pos);

            string path = Application.StartupPath + "\\download";
            if (!Directory.Exists(path))
            {
                Directory.CreateDirectory(path);
            }

            string filePathName = path + "\\" + fileName;
            if (File.Exists(filePathName))
            {
                return; // already downloaded earlier
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = UserAgent;
            request.Proxy = null; // bypass proxy configuration for this request

            // Nothing is sent until GetResponse() is called; the request method defaults to GET.
            // 'using' guarantees the connection and the file handle are released even when the
            // transfer fails half-way.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (FileStream fileStream = new FileStream(filePathName, FileMode.Create))
            {
                responseStream.CopyTo(fileStream);
            }
        }
    }
}
多线程爬取网页代码:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using utils;

namespace 爬虫
{
    /// <summary>
    /// Main window of the crawler. Start (button1) launches worker threads that walk the list
    /// pages, follow every profile link found there and download the matching images;
    /// button2 stops, button3 pauses and button4 resumes the workers.
    /// </summary>
    /// <remarks>
    /// Control names, handler names and <c>initializecomponent</c> are kept exactly as spelled
    /// in this file because they are shared with the other part of this partial class.
    /// </remarks>
    public partial class form1 : Form
    {
        /// <summary>Number of worker threads started per run.</summary>
        private const int WorkerCount = 10;

        /// <summary>Number of consecutive list pages handled by each worker.</summary>
        private const int PagesPerWorker = 10;

        /// <summary>Total list pages per run; reaching it means the run is complete.</summary>
        private const int TotalPages = WorkerCount * PagesPerWorker;

        // Compiled once and shared by all workers instead of being rebuilt for every page.
        // Group 2 captures the href value of a profile link.
        private static readonly Regex ProfileLinkRegex =
            new Regex("<a[\\s]+class=\"j-userpic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");

        // Group 1 captures the src value of an image on a profile page.
        private static readonly Regex ImageRegex =
            new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

        List<Thread> threadlist = new List<Thread>();
        Thread thread = null;

        /// <summary>
        /// Progress counters of one run, shared by all workers of that run.
        /// Workers modify the counters only through <see cref="Interlocked"/>.
        /// </summary>
        private sealed class CrawlStats
        {
            public int Pages;
            public int Persons;
            public int Images;
            public DateTime Started;
        }

        public form1()
        {
            initializecomponent();
        }

        /// <summary>Start: resets the progress display and launches the worker threads.</summary>
        private void button1_click(object sender, EventArgs e)
        {
            button3.Enabled = true;
            button2.Enabled = true;
            button1.Enabled = false;
            lblpage.Text = "已完成页数:0";

            // Forget workers of earlier runs that have already ended so the list does not grow
            // with every run.
            threadlist.RemoveAll(delegate(Thread finished) { return !finished.IsAlive; });

            CrawlStats stats = new CrawlStats();
            stats.Started = DateTime.Now;

            for (int i = 1; i <= WorkerCount; i++)
            {
                thread = new Thread(new ParameterizedThreadStart(delegate(object obj)
                {
                    CrawlPageGroup(Convert.ToInt32(obj), stats);
                }));
                // A worker blocked inside a request must not keep the process alive after the
                // window has been closed.
                thread.IsBackground = true;
                thread.Start(i);
                threadlist.Add(thread);
            }
        }

        /// <summary>
        /// Worker entry point: crawls the <see cref="PagesPerWorker"/> list pages that belong to
        /// the 1-based <paramref name="group"/>.
        /// </summary>
        private void CrawlPageGroup(int group, CrawlStats stats)
        {
            for (int j = 1; j <= PagesPerWorker; j++)
            {
                // Local to this worker. A single variable shared by all workers could be
                // overwritten by another thread between computing it and building the URL.
                int pageIndex = (group - 1) * PagesPerWorker + j;
                try
                {
                    CrawlListPage(pageIndex, stats);
                }
                catch (ThreadAbortException)
                {
                    throw; // stop was requested
                }
                catch (Exception ex)
                {
                    // Best effort: a page that fails is skipped but still counted as processed.
                    System.Diagnostics.Debug.WriteLine(ex);
                }

                ReportPageDone(stats);
            }
        }

        /// <summary>Fetches one list page and crawls every site-relative profile link on it.</summary>
        private void CrawlListPage(int pageIndex, CrawlStats stats)
        {
            string pageHtml = httprequestutil.getpagehtml("http://tt.mop.com/c44/0/1_" + pageIndex.ToString() + ".html");

            foreach (Match link in ProfileLinkRegex.Matches(pageHtml))
            {
                string url = link.Groups[2].Value;
                if (url.StartsWith("/", StringComparison.Ordinal))
                {
                    CrawlProfilePage("http://tt.mop.com" + url, stats);
                }
            }
        }

        /// <summary>Fetches one profile page and downloads every matching image found on it.</summary>
        private void CrawlProfilePage(string profileUrl, CrawlStats stats)
        {
            string profileHtml = httprequestutil.getpagehtml(profileUrl);

            Interlocked.Increment(ref stats.Persons);
            lblperson.Invoke(new Action(delegate()
            {
                lblperson.Text = "已完成条数:" + stats.Persons.ToString();
            }));

            foreach (Match image in ImageRegex.Matches(profileHtml))
            {
                string imgUrl = image.Groups[1].Value;
                if (!imgUrl.StartsWith("http://i1", StringComparison.Ordinal))
                {
                    continue;
                }

                try
                {
                    httprequestutil.httpdownloadfile(imgUrl);
                    ReportImageDownloaded(stats);
                }
                catch (ThreadAbortException)
                {
                    throw; // stop was requested
                }
                catch (Exception ex)
                {
                    // Best effort: one failed download must not end the crawl of this page.
                    System.Diagnostics.Debug.WriteLine(ex);
                }

                Thread.Sleep(1);
            }
        }

        /// <summary>Counts one downloaded image and refreshes the image and speed labels.</summary>
        private void ReportImageDownloaded(CrawlStats stats)
        {
            Interlocked.Increment(ref stats.Images);
            lblnum.Invoke(new Action(delegate()
            {
                int images = stats.Images;
                lblnum.Text = "已下载图片数" + images.ToString();

                double seconds = DateTime.Now.Subtract(stats.Started).TotalSeconds;
                if (seconds > 0)
                {
                    lblspeed.Text = "速度:" + (images / seconds).ToString("0.0") + "张/秒";
                }
            }));
        }

        /// <summary>
        /// Counts one processed list page, refreshes the page label and, when the last page of
        /// the run has been processed, re-enables Start and shows the completion message.
        /// </summary>
        private void ReportPageDone(CrawlStats stats)
        {
            // The value returned by the atomic increment is unique per call, so exactly one
            // worker observes TotalPages and announces completion.
            int donePages = Interlocked.Increment(ref stats.Pages);

            lblpage.Invoke(new Action(delegate()
            {
                lblpage.Text = "已完成页数:" + stats.Pages.ToString();
            }));

            if (donePages == TotalPages)
            {
                button1.Invoke(new Action(delegate()
                {
                    button1.Enabled = true;
                    // Shown on the UI thread so the dialog is owned by this window.
                    MessageBox.Show(this, "完成!");
                }));
            }
        }

        /// <summary>Stop: aborts all workers and resets the buttons.</summary>
        private void button2_click(object sender, EventArgs e)
        {
            // Click handlers already run on the UI thread, so no Invoke is needed here.
            StopWorkers();
            button1.Enabled = true;
            button2.Enabled = false;
            button3.Enabled = false;
            button4.Enabled = false;
        }

        /// <summary>Aborts all workers when the window is closing.</summary>
        private void form1_formclosing(object sender, FormClosingEventArgs e)
        {
            StopWorkers();
        }

        /// <summary>Pause: suspends every worker that is still alive and not yet suspended.</summary>
        private void button3_click(object sender, EventArgs e)
        {
            foreach (Thread worker in threadlist)
            {
                // ThreadState is a flags enum. Workers waiting on a request, a sleep or an
                // Invoke report WaitSleepJoin, so testing for "== Running" would skip them.
                if (worker.IsAlive && !IsSuspended(worker))
                {
                    try
                    {
                        worker.Suspend();
                    }
                    catch (ThreadStateException)
                    {
                        // The worker ended between the check and the call; nothing to pause.
                    }
                }
            }

            button3.Enabled = false;
            button4.Enabled = true;
        }

        /// <summary>Resume: resumes every suspended worker.</summary>
        private void button4_click(object sender, EventArgs e)
        {
            foreach (Thread worker in threadlist)
            {
                if (IsSuspended(worker))
                {
                    TryResume(worker);
                }
            }

            button3.Enabled = true;
            button4.Enabled = false;
        }

        /// <summary>
        /// Aborts every worker. A suspended worker is resumed first because calling Abort() on
        /// a suspended thread throws ThreadStateException in the caller.
        /// </summary>
        /// <remarks>
        /// NOTE(review): Thread.Suspend/Resume/Abort are obsolete APIs. They are kept so that
        /// pause and stop remain immediate; a cooperative pause/cancel would be the safer design.
        /// </remarks>
        private void StopWorkers()
        {
            foreach (Thread worker in threadlist)
            {
                if (IsSuspended(worker))
                {
                    TryResume(worker);
                }

                try
                {
                    worker.Abort();
                }
                catch (ThreadStateException ex)
                {
                    System.Diagnostics.Debug.WriteLine(ex);
                }
            }
        }

        /// <summary>Returns true when the worker is suspended or about to be suspended.</summary>
        private static bool IsSuspended(Thread worker)
        {
            return (worker.ThreadState & (ThreadState.Suspended | ThreadState.SuspendRequested)) != 0;
        }

        /// <summary>Resumes the worker, ignoring the case where it is no longer suspended.</summary>
        private static void TryResume(Thread worker)
        {
            try
            {
                worker.Resume();
            }
            catch (ThreadStateException)
            {
                // The worker ended or was resumed in the meantime.
            }
        }
    }
}
截图:
以上就是本文的全部内容,希望对大家的学习有所帮助。
上一篇: jquery中为什么能用$操作
下一篇: C#6.0中10大新特性的应用和总结