C# HtmlAgilityPack爬取静态页面
程序员文章站
2022-05-03 15:08:40
最近对爬虫很感兴趣,稍微研究了一下,利用HtmlAgilityPack制作了一个十分简单的爬虫,这个简易爬虫只能获取静态页面的Html ......
最近对爬虫很感兴趣,稍微研究了一下,利用HtmlAgilityPack制作了一个十分简单的爬虫。这个简易爬虫只能获取静态页面的HTML。
HtmlAgilityPack简介
HtmlAgilityPack是一个解析速度十分快、并且开源的HTML解析工具。它支持使用XPath解析HTML,能够帮助我们像解析XML文档一样轻松、方便地解析HTML文档。
C#安装HtmlAgilityPack
- 如果VS安装有NuGet,在NuGet中直接搜索HtmlAgilityPack并安装即可。
- 或者手动下载,解压缩后有3个文件,这里只需要将其中的HtmlAgilityPack.dll、HtmlAgilityPack.xml引入解决方案中即可使用。
实例(获取某页面图片)
加载HTML页面
// Load the page straight from the web and parse it into a DOM.
// HtmlWeb and HtmlDocument come from the HtmlAgilityPack namespace.
string url = "https://www.bilibili.com";
HtmlWeb web = new HtmlWeb();
HtmlDocument hd = web.Load(url);
利用WebClient写一个图片下载器
需要using System.Net
和using System.IO
/// <summary>
/// Image downloader: saves a remote image into a local folder.
/// Requires <c>using System;</c>, <c>using System.IO;</c> and <c>using System.Net;</c>.
/// </summary>
public class imgdownloader
{
    /// <summary>
    /// Downloads a single image and stores it as <paramref name="filename"/> inside
    /// <paramref name="folderpath"/>. A failed download is reported on the console
    /// and does not propagate, so one bad URL does not stop a crawl.
    /// </summary>
    /// <param name="webclient">Client used to perform the download.</param>
    /// <param name="url">Image URL; a scheme-less URL such as "//host/a.png" gets "https:" prepended.</param>
    /// <param name="folderpath">Target folder; created if it does not exist yet.</param>
    /// <param name="filename">File name (with extension) to save the image under.</param>
    public static void downloadimg(WebClient webclient, string url, string folderpath, string filename)
    {
        // Create the target folder on first use.
        if (!Directory.Exists(folderpath))
        {
            Directory.CreateDirectory(folderpath);
        }

        // Complete URLs that lack a scheme. The check must look at the START of the
        // URL: searching the whole string would treat "//host/r?to=http://x" as complete.
        bool hasScheme = url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                      || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            url = "https:" + url;
        }

        // Download the image. Path.Combine inserts the directory separator when
        // folderpath does not already end with one.
        try
        {
            webclient.DownloadFile(url, Path.Combine(folderpath, filename));
            Console.WriteLine(filename + "下载成功");
        }
        catch (Exception ex)
        {
            // Best effort: print the failure reason followed by the offending URL.
            Console.Write(ex.Message);
            Console.WriteLine(url);
        }
    }
}
通过XPath获取img标签中的图片
string imgpath = "//img"; // XPath: every <img> element in the document
int imgnum = 0;           // running image counter, used as the base of each file name

// Download the image referenced by every <img src="...">.
// `hd` is the loaded HtmlDocument; `wc` is the WebClient declared in Main of the
// complete program. SelectNodes returns null (not an empty collection) when
// nothing matches, so guard before iterating.
HtmlNodeCollection imgnodes = hd.DocumentNode.SelectNodes(imgpath);
if (imgnodes != null)
{
    foreach (HtmlNode node in imgnodes)
    {
        HtmlAttribute src = node.Attributes["src"];
        if (src == null || string.IsNullOrWhiteSpace(src.Value))
        {
            continue; // no usable image URL on this tag
        }

        string imgurl = src.Value.Trim();

        // Derive the file extension from the URL path only: drop any query string
        // or fragment, and accept a dot only when it comes after the last '/'
        // (otherwise it belongs to the host name or a folder, not the file).
        string urlpath = imgurl.Split('?', '#')[0];
        int dot = urlpath.LastIndexOf('.');
        string ext = dot > urlpath.LastIndexOf('/') ? urlpath.Substring(dot) : "";

        imgnum++;
        string filename = imgnum + ext;
        imgdownloader.downloadimg(wc, imgurl, "images/", filename);
    }
}
通过XPath获取背景图
// Download CSS background images declared in inline style attributes.
// `hd`, `wc` and `imgnum` are the document, WebClient and image counter declared
// earlier in the program. Requires using System.Text.RegularExpressions.
string bgimgpath = "//*[@style]"; // XPath: every element that has a style attribute

// SelectNodes returns null (not an empty collection) when nothing matches.
HtmlNodeCollection stylednodes = hd.DocumentNode.SelectNodes(bgimgpath);
if (stylednodes != null)
{
    foreach (HtmlNode node in stylednodes)
    {
        string style = node.Attributes["style"].Value;

        // Capture exactly the argument of background-image:url(...): tolerate
        // whitespace around ':' and inside the parentheses, and strip optional
        // quotes. Matching the first "(...)" in the style instead could pick up
        // an unrelated function such as translate(...).
        Match m = Regex.Match(
            style,
            @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")\s]+)['""]?\s*\)",
            RegexOptions.IgnoreCase);
        if (!m.Success)
        {
            continue; // this element has no background image
        }

        string bgimgurl = m.Groups["url"].Value;

        // Derive the file extension from the URL path only: drop any query string
        // or fragment, and accept a dot only when it comes after the last '/'.
        string urlpath = bgimgurl.Split('?', '#')[0];
        int dot = urlpath.LastIndexOf('.');
        string ext = dot > urlpath.LastIndexOf('/') ? urlpath.Substring(dot) : "";

        // Count only images that are actually handed to the downloader.
        imgnum++;
        string filename = imgnum + ext;
        imgdownloader.downloadimg(wc, bgimgurl, "images/bgcimg/", filename);
    }
}
完整代码
using System;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net;
using System.IO;
using HtmlAgilityPack;
using System.Text.RegularExpressions;

namespace webcrawlerdemo
{
    /// <summary>
    /// Minimal static-page crawler: loads one page and downloads every image it
    /// references, either through an &lt;img&gt; tag or an inline CSS background-image.
    /// </summary>
    class program
    {
        // Matches background-image:url(...) inside a style attribute and captures the
        // URL without surrounding quotes. Compiled once and reused for every element.
        private static readonly Regex BackgroundImageUrl = new Regex(
            @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")\s]+)['""]?\s*\)",
            RegexOptions.IgnoreCase);

        static void Main(string[] args)
        {
            string url = "https://www.bilibili.com";
            HtmlWeb web = new HtmlWeb();
            HtmlDocument hd = web.Load(url); // download and parse the HTML page

            int imgnum = 0; // running image counter, used as the base of each file name

            // WebClient is IDisposable; release it once all downloads are done.
            using (WebClient wc = new WebClient())
            {
                // Images referenced by <img src="...">.
                string imgpath = "//img";
                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection imgnodes = hd.DocumentNode.SelectNodes(imgpath);
                if (imgnodes != null)
                {
                    foreach (HtmlNode node in imgnodes)
                    {
                        HtmlAttribute src = node.Attributes["src"];
                        if (src == null || string.IsNullOrWhiteSpace(src.Value))
                        {
                            continue; // no usable image URL on this tag
                        }

                        string imgurl = src.Value.Trim();
                        imgnum++;
                        string filename = imgnum + GetUrlExtension(imgurl);
                        imgdownloader.downloadimg(wc, imgurl, "images/", filename);
                    }
                }

                // Background images declared in inline style attributes.
                string bgimgpath = "//*[@style]";
                HtmlNodeCollection stylednodes = hd.DocumentNode.SelectNodes(bgimgpath);
                if (stylednodes != null)
                {
                    foreach (HtmlNode node in stylednodes)
                    {
                        Match m = BackgroundImageUrl.Match(node.Attributes["style"].Value);
                        if (!m.Success)
                        {
                            continue; // this element has no background image
                        }

                        string bgimgurl = m.Groups["url"].Value;
                        // Count only images that are actually handed to the downloader.
                        imgnum++;
                        string filename = imgnum + GetUrlExtension(bgimgurl);
                        imgdownloader.downloadimg(wc, bgimgurl, "images/bgcimg/", filename);
                    }
                }
            }

            Console.WriteLine("----------end----------");
            Console.ReadKey();
        }

        /// <summary>
        /// Returns the file extension (including the leading dot) of the file a URL
        /// points at, or an empty string when the last path segment has none.
        /// </summary>
        private static string GetUrlExtension(string url)
        {
            // Ignore any query string or fragment; only the path names the file.
            string urlpath = url.Split('?', '#')[0];
            int dot = urlpath.LastIndexOf('.');
            // A dot before the last '/' belongs to the host name or a folder.
            return dot > urlpath.LastIndexOf('/') ? urlpath.Substring(dot) : "";
        }
    }

    /// <summary>
    /// Image downloader: saves a remote image into a local folder.
    /// </summary>
    public class imgdownloader
    {
        /// <summary>
        /// Downloads a single image and stores it as <paramref name="filename"/> inside
        /// <paramref name="folderpath"/>. A failed download is reported on the console
        /// and does not propagate, so one bad URL does not stop the crawl.
        /// </summary>
        /// <param name="webclient">Client used to perform the download.</param>
        /// <param name="url">Image URL; a scheme-less URL such as "//host/a.png" gets "https:" prepended.</param>
        /// <param name="folderpath">Target folder; created if it does not exist yet.</param>
        /// <param name="filename">File name (with extension) to save the image under.</param>
        public static void downloadimg(WebClient webclient, string url, string folderpath, string filename)
        {
            // Create the target folder on first use.
            if (!Directory.Exists(folderpath))
            {
                Directory.CreateDirectory(folderpath);
            }

            // Complete URLs that lack a scheme. The check must look at the START of the
            // URL: searching the whole string would treat "//host/r?to=http://x" as complete.
            bool hasScheme = url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                          || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                url = "https:" + url;
            }

            // Download the image. Path.Combine inserts the directory separator when
            // folderpath does not already end with one.
            try
            {
                webclient.DownloadFile(url, Path.Combine(folderpath, filename));
                Console.WriteLine(filename + "下载成功");
            }
            catch (Exception ex)
            {
                // Best effort: print the failure reason followed by the offending URL.
                Console.Write(ex.Message);
                Console.WriteLine(url);
            }
        }
    }
}