C# HtmlAgilityPack+Selenium爬取需要拉动滚动条的页面内容

程序员文章站 2022-05-03 15:06:10

现在大多数网站都是随着滚动条的滑动加载页面内容的，因此单纯获得静态页面的Html是无法获得全部的页面内容的。使用Selenium就可以模拟浏览器拉动滑动条来加载所有页面内容。 ......

现在大多数网站都是随着滚动条的滑动加载页面内容的，因此单纯获得静态页面的HTML是无法获得全部的页面内容的。使用Selenium就可以模拟浏览器拉动滑动条来加载所有页面内容。

前情提要

C# HtmlAgilityPack爬取静态页面

Selenium简介

Selenium是一个Web自动化测试工具。Selenium测试直接运行在浏览器中，就像真正的用户在操作一样。支持的浏览器包括IE（7, 8, 9, 10, 11）、Mozilla Firefox、Safari、Google Chrome、Opera等。主要功能包括：测试与浏览器的兼容性——测试你的应用程序是否能够很好地工作在不同浏览器和操作系统之上；测试系统功能——创建回归测试检验软件功能和用户需求。支持自动录制动作和自动生成 .NET、Java、Perl等不同语言的测试脚本。Selenium也是一款使用Apache License 2.0协议发布的开源框架。

C#安装Selenium

本文仅仅是使用Selenium实现拉动滚动条的功能，所以不对Selenium进行过多的介绍。
通过NuGet包管理器搜索"Selenium"，分别安装:

Selenium.WebDriver
Selenium.Chrome.WebDriver

实例(获取某网站主页所有图片)

普通获取网页HTML

// Launch Chrome through Selenium and load the target page.
ChromeDriver driver = new ChromeDriver();
driver.Navigate().GoToUrl(url);
string title = driver.Title; // page title
string html = driver.PageSource; // page HTML as currently rendered by the browser

不启动Chrome窗口及关闭ChromeDriver控制台获取网页

程序执行时会自动打开Chrome窗口，并在控制台中输出一些信息，我们不需要这些东西。

// Run Chrome without opening a browser window.
ChromeOptions options = new ChromeOptions();
options.AddArgument("headless");

// Hide the chromedriver console window.
ChromeDriverService driverService = ChromeDriverService.CreateDefaultService();
driverService.HideCommandPromptWindow = true;

ChromeDriver driver = new ChromeDriver(driverService, options);
driver.Navigate().GoToUrl(url);

将页面滚动到底部

如果使用scrollTo(0, document.body.scrollHeight)直接将页面滚动到底部，会导致页面中间部分读取失败，所以需要分几次滑动，并且给页面足够的时间加载。

// Run JavaScript in the page through the IJavaScriptExecutor interface.
// The cast does not depend on the loop variable, so it is done once up front.
IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
for (int i = 1; i <= 10; i++)
{
    // Scroll to i/10 of the document height. The DOM names are case-sensitive:
    // scrollTo / scrollHeight.
    string jsCode = "window.scrollTo({top: document.body.scrollHeight / 10 * " + i + ", behavior: \"smooth\"});";
    js.ExecuteScript(jsCode);
    // Pause between steps so the newly revealed section has time to load.
    Thread.Sleep(1000);
}

使用HtmlAgilityPack解析读取到的HTML

以下内容与上一篇文章基本相同

string title = driver.Title; // page title
string html = driver.PageSource; // page HTML after scrolling

HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html); // parse the HTML string
string imgPath = "//img"; // XPath selecting every <img> element
// SelectNodes returns null (not an empty collection) when nothing matches,
// so check before iterating.
HtmlNodeCollection imgNodes = doc.DocumentNode.SelectNodes(imgPath);
if (imgNodes != null)
{
    // Read the image referenced by each <img> tag.
    foreach (HtmlNode node in imgNodes)
    {
        ······
    }
}

完整代码

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net;
using System.IO;
using HtmlAgilityPack;
using System.Text.RegularExpressions;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using System.Threading;

namespace WebCrawlerDemo
{
    /// <summary>
    /// Console entry point: renders a page in headless Chrome, scrolls it so that
    /// lazily loaded content appears, then downloads every image referenced by an
    /// &lt;img&gt; tag or by an inline CSS background-image.
    /// </summary>
    class Program
    {
        // Captures the URL inside "background-image: url(...)", tolerating optional
        // whitespace and optional single/double quotes around the URL.
        private static readonly Regex BackgroundImageRegex = new Regex(
            @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+?)\s*['""]?\s*\)",
            RegexOptions.IgnoreCase);

        static void Main(string[] args)
        {
            const string url = "https://www.bilibili.com";
            int imgNum = 0; // running image number; also used as the file name

            string html = FinalHtml.GetFinalHtml(url, 10);

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            // WebClient is IDisposable; share one instance for all downloads and
            // release it when done.
            using (WebClient wc = new WebClient())
            {
                imgNum = DownloadImgTags(doc, wc, imgNum);
                imgNum = DownloadBackgroundImages(doc, wc, imgNum);
            }

            Console.WriteLine("----------end----------");
            Console.WriteLine($"一共获得: {imgNum}张图");
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads the image referenced by the src attribute of every &lt;img&gt; element.
        /// </summary>
        /// <param name="doc">Parsed document to search.</param>
        /// <param name="wc">Client used for the downloads.</param>
        /// <param name="imgNum">Number of images processed so far.</param>
        /// <returns>The updated image count.</returns>
        private static int DownloadImgTags(HtmlDocument doc, WebClient wc, int imgNum)
        {
            // SelectNodes returns null, not an empty collection, when nothing matches.
            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//img");
            if (nodes == null)
            {
                return imgNum;
            }

            foreach (HtmlNode node in nodes)
            {
                // Decode entities such as &amp; so query strings survive intact.
                string imgUrl = HtmlEntity.DeEntitize(node.GetAttributeValue("src", string.Empty));
                if (string.IsNullOrWhiteSpace(imgUrl))
                {
                    continue;
                }

                imgNum++;
                // File name is the running number plus the extension taken from the URL.
                string fileName = ImgDownloader.GetImgName(imgUrl, imgNum);
                ImgDownloader.DownloadImg(wc, imgUrl, "images/", fileName);
            }

            return imgNum;
        }

        /// <summary>
        /// Downloads the image referenced by an inline CSS background-image on any
        /// element that carries a style attribute.
        /// </summary>
        /// <param name="doc">Parsed document to search.</param>
        /// <param name="wc">Client used for the downloads.</param>
        /// <param name="imgNum">Number of images processed so far.</param>
        /// <returns>The updated image count.</returns>
        private static int DownloadBackgroundImages(HtmlDocument doc, WebClient wc, int imgNum)
        {
            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//*[@style]");
            if (nodes == null)
            {
                return imgNum;
            }

            foreach (HtmlNode node in nodes)
            {
                // Serialized markup may encode the quotes inside url(...) as &quot;.
                string style = HtmlEntity.DeEntitize(node.GetAttributeValue("style", string.Empty));
                Match match = BackgroundImageRegex.Match(style);
                if (!match.Success)
                {
                    continue;
                }

                string bgImgUrl = match.Groups["url"].Value.Trim();
                if (bgImgUrl.Length == 0)
                {
                    continue;
                }

                imgNum++;
                string fileName = ImgDownloader.GetImgName(bgImgUrl, imgNum);
                ImgDownloader.DownloadImg(wc, bgImgUrl, "images/bgcimg/", fileName);
            }

            return imgNum;
        }
    }

    /// <summary>
    /// Image downloader.
    /// </summary>
    public class ImgDownloader
    {
        private const string DefaultExtension = ".jpg";

        /// <summary>
        /// Downloads one image into a folder. Failures are reported on the console
        /// and do not propagate, so one bad URL does not stop the crawl.
        /// </summary>
        /// <param name="webClient">Client used for the download.</param>
        /// <param name="url">Image URL; a protocol-relative URL ("//host/path") is completed with "https:".</param>
        /// <param name="folderPath">Destination folder; created if it does not exist.</param>
        /// <param name="fileName">Destination file name.</param>
        public static void DownloadImg(WebClient webClient, string url, string folderPath, string fileName)
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                Console.WriteLine("Empty image URL skipped");
                return;
            }

            // CreateDirectory does nothing when the folder already exists.
            Directory.CreateDirectory(folderPath);

            // Only the start of the URL decides whether a scheme is present; "http:"
            // appearing later (for example inside a query string) must not count.
            bool hasScheme = url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme && url.StartsWith("//", StringComparison.Ordinal))
            {
                url = "https:" + url;
            }

            try
            {
                webClient.DownloadFile(url, Path.Combine(folderPath, fileName));
                Console.WriteLine(fileName + "下载成功");
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: log the failure with the offending URL and continue.
                Console.Write(ex.Message);
                Console.Write(" ");
                Console.WriteLine(url);
            }
        }

        /// <summary>
        /// Builds a file name from the image number and the extension found in the URL.
        /// </summary>
        /// <param name="imageUrl">Image URL.</param>
        /// <param name="imageNum">Image number.</param>
        /// <returns>
        /// The number followed by the extension of the URL's last path segment, or by
        /// ".jpg" when that segment has no usable extension.
        /// </returns>
        public static string GetImgName(string imageUrl, int imageNum)
        {
            string path = imageUrl ?? string.Empty;

            // Drop the query string and fragment; they are not part of the file name.
            int cut = path.IndexOfAny(new[] { '?', '#' });
            if (cut >= 0)
            {
                path = path.Substring(0, cut);
            }

            // Skip "scheme://host" (or "//host") so a dot in the host name is not
            // mistaken for an extension.
            int authority = path.IndexOf("//", StringComparison.Ordinal);
            if (authority >= 0)
            {
                int pathStart = path.IndexOf('/', authority + 2);
                path = pathStart >= 0 ? path.Substring(pathStart) : string.Empty;
            }

            string lastSegment = path.Substring(path.LastIndexOf('/') + 1);
            int dot = lastSegment.LastIndexOf('.');

            string extension = DefaultExtension;
            if (dot >= 0 && dot < lastSegment.Length - 1)
            {
                string candidate = lastSegment.Substring(dot);
                // Fall back to the default if the candidate cannot be used in a file name.
                if (candidate.IndexOfAny(Path.GetInvalidFileNameChars()) < 0)
                {
                    extension = candidate;
                }
            }

            return imageNum + extension;
        }
    }

    /// <summary>
    /// Retrieves the HTML of a page after its JavaScript has run.
    /// </summary>
    public class FinalHtml
    {
        // Pause after each scroll step so lazily loaded content can arrive.
        private const int ScrollPauseMilliseconds = 1000;

        /// <summary>
        /// Loads a page in headless Chrome, scrolls it to the bottom in equal steps
        /// and returns the resulting HTML.
        /// </summary>
        /// <param name="url">Page address.</param>
        /// <param name="sectionNum">Number of scroll steps; zero or less skips scrolling.</param>
        /// <returns>HTML string of the page after scrolling.</returns>
        public static string GetFinalHtml(string url, int sectionNum)
        {
            // Run Chrome without opening a browser window.
            ChromeOptions options = new ChromeOptions();
            options.AddArgument("headless");

            // Hide the chromedriver console window.
            ChromeDriverService driverService = ChromeDriverService.CreateDefaultService();
            driverService.HideCommandPromptWindow = true;

            ChromeDriver driver = new ChromeDriver(driverService, options);
            try
            {
                driver.Navigate().GoToUrl(url);

                string title = driver.Title;
                Console.WriteLine($"title: {title}");
                Console.Write("页面滚动中，请稍后");

                // Jumping straight to the bottom skips the middle of the page, so
                // scroll in sectionNum steps and wait after each one.
                IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
                for (int i = 1; i <= sectionNum; i++)
                {
                    // DOM names are case-sensitive: scrollTo / scrollHeight.
                    string jsCode = "window.scrollTo({top: document.body.scrollHeight / " + sectionNum + " * " + i + ", behavior: \"smooth\"});";
                    js.ExecuteScript(jsCode);
                    Console.Write(".");
                    Thread.Sleep(ScrollPauseMilliseconds);
                }
                Console.WriteLine();

                return driver.PageSource;
            }
            finally
            {
                // Always shut the browser down, even when navigation or scrolling throws.
                driver.Quit();
            }
        }
    }
}

参考文章

Selenium爬取网页实战

上一篇： 06.前端之BootStrap

下一篇：什么？羊奶不能乱喝？什么人不能喝羊奶