C# Web Crawler Code Share: A Simple Crawling Tool in C#
An editor at our company needed to scrape content from web pages, so she asked me to help put together a simple crawling tool.
Fetching a page's content like this is nothing difficult for most readers, but there are a few small tweaks here. The code is below for your reference.
private string GetHttpWebRequest(string url)
{
    HttpWebResponse result;
    string strHtml = string.Empty;
    try
    {
        // First attempt: request the page and read it as UTF-8.
        Uri uri = new Uri(url);
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(uri);
        myReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        myReq.Accept = "*/*";
        myReq.KeepAlive = true;
        myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        result = (HttpWebResponse)myReq.GetResponse();
        Stream receiveStream = result.GetResponseStream();
        StreamReader readerOfStream = new StreamReader(receiveStream, System.Text.Encoding.GetEncoding("utf-8"));
        strHtml = readerOfStream.ReadToEnd();
        readerOfStream.Close();
        receiveStream.Close();
        result.Close();
    }
    catch
    {
        // Second attempt: retry and read the page as GB2312, which many Chinese sites still use.
        Uri uri = new Uri(url);
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(uri);
        myReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        myReq.Accept = "*/*";
        myReq.KeepAlive = true;
        myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        try
        {
            result = (HttpWebResponse)myReq.GetResponse();
        }
        catch (WebException ex)
        {
            // Even when the server returns an HTTP error status, the body may still be readable.
            result = (HttpWebResponse)ex.Response;
        }
        Stream receiveStream = result.GetResponseStream();
        StreamReader readerOfStream = new StreamReader(receiveStream, System.Text.Encoding.GetEncoding("gb2312"));
        strHtml = readerOfStream.ReadToEnd();
        readerOfStream.Close();
        receiveStream.Close();
        result.Close();
    }
    return strHtml;
}
This method fetches a page's HTML source from a URL, with a few small tweaks: different pages use different encodings, and some sites even have anti-scraping measures in place, but with minor adjustments this method can still fetch them.
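Since encodings vary from site to site, one possible refinement (just a sketch, not part of the original tool) is to pick the decoder from the encoding the server reports via HttpWebResponse.CharacterSet, falling back to UTF-8 when it is missing or unrecognized:

// Sketch only: choose the encoding reported by the server instead of hard-coding utf-8/gb2312.
// "result" is assumed to be the HttpWebResponse obtained in the method above.
string charset = string.IsNullOrEmpty(result.CharacterSet) ? "utf-8" : result.CharacterSet;
Encoding enc;
try { enc = Encoding.GetEncoding(charset); }
catch (ArgumentException) { enc = Encoding.UTF8; }   // unknown charset name: fall back
using (StreamReader reader = new StreamReader(result.GetResponseStream(), enc))
{
    strHtml = reader.ReadToEnd();
}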
The following code extracts all hyperlinks from the crawled page:
/// <summary>
/// Extract the URLs contained in a page's HTML.
/// </summary>
/// <param name="htmlCode">the page HTML</param>
/// <param name="url">the page URL, used to resolve relative links</param>
/// <returns>the newly discovered links</returns>
private static List<string> GetHyperLinks(string htmlCode, string url)
{
    bool isGenXin = false;
    StringBuilder webUrlSB = new StringBuilder();    // for SQL storage
    StringBuilder linkSB = new StringBuilder();      // for display
    List<string> webUrlListZX = new List<string>();  // newly found links
    List<string> webUrlList = new List<string>();    // previously known links
    string productionContent = htmlCode;
    // Grab the scheme + host of the page URL so relative hrefs can be made absolute.
    Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
    string wangZhanYuMing = reg.Match(url, 0).Value;
    MatchCollection mc = Regex.Matches(
        productionContent.Replace("href=\"/", "href=\"" + wangZhanYuMing)
                         .Replace("href='/", "href='" + wangZhanYuMing)
                         .Replace("href=/", "href=" + wangZhanYuMing)
                         .Replace("href=\"./", "href=\"" + wangZhanYuMing),
        @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);
    int index = 1;
    foreach (Match m in mc)
    {
        MatchCollection mc1 = Regex.Matches(m.Value, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
        if (mc1.Count > 0)
        {
            foreach (Match m1 in mc1)
            {
                string linkUrlStr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
                webUrlSB.Append("$-$");
                webUrlSB.Append(linkUrlStr);
                webUrlSB.Append("$_$");
                if (!webUrlList.Contains(linkUrlStr) && !webUrlListZX.Contains(linkUrlStr))
                {
                    isGenXin = true;
                    webUrlListZX.Add(linkUrlStr);
                    linkSB.AppendFormat("{0}<br/>", linkUrlStr);
                }
            }
        }
        else
        {
            if (m.Value.IndexOf("javascript") == -1)
            {
                // Relative link: prefix it with the directory path of the current page.
                string wangZhanXiangDuiLuJin = url.Substring(0, url.LastIndexOf("/") + 1);
                string amStr = m.Value.Replace("href=\"", "href=\"" + wangZhanXiangDuiLuJin)
                                      .Replace("href='", "href='" + wangZhanXiangDuiLuJin);
                MatchCollection mc11 = Regex.Matches(amStr, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
                foreach (Match m1 in mc11)
                {
                    string linkUrlStr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
                    webUrlSB.Append("$-$");
                    webUrlSB.Append(linkUrlStr);
                    webUrlSB.Append("$_$");
                    if (!webUrlList.Contains(linkUrlStr) && !webUrlListZX.Contains(linkUrlStr))
                    {
                        isGenXin = true;
                        webUrlListZX.Add(linkUrlStr);
                        linkSB.AppendFormat("{0}<br/>", linkUrlStr);
                    }
                }
            }
        }
        index++;
    }
    return webUrlListZX;
}
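To make the href rewriting concrete, here is a tiny illustration with made-up sample values (the page URL and tag below are examples, not from the original post): a root-relative link is turned into an absolute one by prefixing it with the scheme-and-host string matched by the regex, the same string.Replace trick GetHyperLinks relies on.

// Illustrative values only.
string pageUrl = "http://www.example.com/news/index.html";
// Same pattern used above to grab the scheme + host ("http://www.example.com/").
string siteRoot = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?").Match(pageUrl).Value;
string tag = "<a href=\"/about.html\">About</a>";
string absoluteTag = tag.Replace("href=\"/", "href=\"" + siteRoot);
Console.WriteLine(absoluteTag);   // <a href="http://www.example.com/about.html">About</a>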
The technique here is simply regex matching. Next come the methods for extracting the page title and for saving the links to an XML file.
/// <summary>
/// Write the extracted URLs to an XML file.
/// </summary>
/// <param name="strUrl">the page the links were extracted from</param>
/// <param name="alHyperLinks">the list of links</param>
private static void WriteToXml(string strUrl, List<string> alHyperLinks)
{
    XmlTextWriter writer = new XmlTextWriter(@"d:\hyperlinks.xml", Encoding.UTF8);
    writer.Formatting = Formatting.Indented;
    writer.WriteStartDocument(false);
    writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
    writer.WriteComment("Hyperlinks extracted from " + strUrl);
    writer.WriteStartElement("HyperLinks");
    writer.WriteStartElement("HyperLinks", null);
    writer.WriteAttributeString("DateTime", DateTime.Now.ToString());
    foreach (string str in alHyperLinks)
    {
        // Element name is the domain suffix, element value is the full URL.
        string title = GetDomain(str);
        string body = str;
        writer.WriteElementString(title, null, body);
    }
    writer.WriteEndElement();
    writer.WriteEndElement();
    writer.Flush();
    writer.Close();
}

/// <summary>
/// Get the domain suffix of a URL (.com, .net, .cn, .org, .gov), or "other".
/// </summary>
/// <param name="strUrl"></param>
/// <returns></returns>
private static string GetDomain(string strUrl)
{
    string retVal;
    string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
    Match m = r.Match(strUrl);
    retVal = m.ToString();
    strRegex = @"\.|/$";
    retVal = Regex.Replace(retVal, strRegex, "").ToString();
    if (retVal == "")
        retVal = "other";
    return retVal;
}

/// <summary>
/// Get the page title.
/// </summary>
/// <param name="html"></param>
/// <returns></returns>
private static string GetTitle(string html)
{
    string titleFilter = @"<title>[\s\S]*?</title>";
    string h1Filter = @"<h1.*?>.*?</h1>";
    string clearFilter = @"<.*?>";
    string title = "";
    Match match = Regex.Match(html, titleFilter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        title = Regex.Replace(match.Groups[0].Value, clearFilter, "");
    }
    // The headline is usually in <h1> and tends to be cleaner than the one in <title>.
    match = Regex.Match(html, h1Filter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        string h1 = Regex.Replace(match.Groups[0].Value, clearFilter, "");
        if (!string.IsNullOrEmpty(h1) && title.StartsWith(h1))
        {
            title = h1;
        }
    }
    return title;
}
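For completeness, a hypothetical entry point might wire the methods together roughly like this (the URL is only an example, and this glue code does not appear in the original post):

// Sketch only: fetch a page, grab its title and links, then persist the links.
string url = "https://www.example.com/";
string html = GetHttpWebRequest(url);            // download the page source
string title = GetTitle(html);                   // extract the <title>/<h1> text
List<string> links = GetHyperLinks(html, url);   // collect absolute links
WriteToXml(url, links);                          // write them to d:\hyperlinks.xml
Console.WriteLine("{0}: {1} links found.", title, links.Count);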
Those are all the methods used; there is still plenty of room for improvement. If you notice any shortcomings, please point them out. Thank you!
That's all for this article. I hope it helps with your learning, and thank you for your support.