欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

C#: using HtmlAgilityPack and ScrapySharp to read a URL and find text

程序员文章站 2022-03-24 16:20:48
https://github.com/exaphaser/ScrapySharp https://github.com/zzzprojects/html-agility-pack https://github.com/atifaziz/Fizzler https://archive.codeplex ......

https://github.com/exaphaser/ScrapySharp

https://github.com/zzzprojects/html-agility-pack

https://github.com/atifaziz/Fizzler

https://archive.codeplex.com/?p=fizzlerex

https://github.com/aspnet/blazor

https://github.com/SteveSanderson/Blazor

https://www.mathjax.org/#samples 数学公式

 

https://github.com/robinvanderknaap/MvcJqGrid

http://www.defenseinnovationmarketplace.mil/strategy.html

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Collections;
using ScrapySharp;
using ScrapySharp.Network;
using ScrapySharp.Core;
using HtmlAgilityPack;


namespace HtmlAgilityPackDemo
{

    /// <summary>
    /// HTML解析利器HtmlAgilityPack
    /// geovindu
    /// 涂聚文
    /// 20180305
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form. The only work done here is the call to
        /// <c>InitializeComponent</c>, which is declared outside this listing
        /// (presumably in the designer half of this partial class).
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }
        /// <summary>
        /// Load handler for the form: pre-fills <c>textBox1</c> with the code "ln".
        /// The text box content is later passed to <c>ParsePageByArea</c> by
        /// <c>button1_Click</c>, so this acts as the default area code.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            const string defaultAreaCode = "ln";
            textBox1.Text = defaultAreaCode;
        }
        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its whole
        /// content as a string.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
        public static string GetWebClient(string url)
        {
            // Every resource is wrapped in "using" so the client, the response stream
            // and the reader are released even when the download or the read throws.
            using (WebClient client = new WebClient())
            using (Stream stream = client.OpenRead(url))
            // Mind the encoding: Encoding.Default depends on the machine/runtime.
            // NOTE(review): confirm it matches the charset served by the target site.
            using (StreamReader reader = new StreamReader(stream, Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the history index page for an area code and extracts every
        /// city link found in the page's <c>dl/dd/a</c> lists.
        /// </summary>
        /// <param name="cityCode">
        /// Code inserted into <c>http://www.tianqihoubao.com/lishi/{code}.htm</c>
        /// (the form pre-fills "ln").
        /// </param>
        /// <param name="listcity">
        /// Receives one <c>CityList</c> per usable link; empty (never null) when
        /// the expected structure is not found.
        /// </param>
        /// <returns>
        /// One <c>"\r\n" + name + ":" + code</c> entry per city, or an empty string
        /// when nothing was found.
        /// </returns>
        public  string ParsePageByArea(String cityCode, out List<CityList> listcity)
        {
            StringBuilder report = new StringBuilder();
            List<CityList> cities = new List<CityList>();
            listcity = cities;

            // Build the URL from the link pattern and the area code.
            String url = String.Format("http://www.tianqihoubao.com/lishi/{0}.htm", cityCode);
            // Download the page source and load it into a document object.
            var pageSource = GetWebClient(url);
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(pageSource);

            // Absolute XPath to the container that holds the <dl> groups.
            var container = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[1]/div[6]/div[1]/div[1]/div[3]");
            if (container == null)
            {
                return report.ToString();
            }

            // SelectNodes yields null (not an empty collection) when nothing matches,
            // so every query result is checked before it is enumerated.
            var groups = container.SelectNodes(@"dl");
            if (groups == null)
            {
                return report.ToString();
            }

            foreach (var group in groups)
            {
                var dd = group.SelectSingleNode(@"dd");
                if (dd == null)
                {
                    continue;
                }

                var links = dd.SelectNodes("a");
                if (links == null)
                {
                    continue;
                }

                foreach (var link in links)
                {
                    var hrefAttribute = link.Attributes["href"];
                    if (hrefAttribute == null)
                    {
                        continue;
                    }

                    // The pinyin code is the segment just before the file extension,
                    // e.g. ".../dalian.html" -> "dalian"; it needs at least two parts.
                    var hrefParts = hrefAttribute.Value.Trim().Split('/', '.');
                    if (hrefParts.Length < 2)
                    {
                        continue;
                    }

                    var cityName = link.InnerText.Trim();
                    var code = hrefParts[hrefParts.Length - 2];

                    CityList city = new CityList();
                    city.CityName = cityName;
                    city.CityCode = code;
                    cities.Add(city);

                    report.Append("\r\n" + string.Format("{0}:{1}", cityName, code));
                }
            }

            return report.ToString();
        }
        /// <summary>
        /// Downloads one month of weather history for a city and parses the rows of
        /// the history table into records.
        /// Example page: http://www.tianqihoubao.com/lishi/dalian/month/201802.html
        /// </summary>
        /// <param name="cityCode">City segment of the URL (e.g. "dalian").</param>
        /// <param name="year">Year placed in the URL.</param>
        /// <param name="month">Month placed in the URL, zero-padded to two digits.</param>
        /// <param name="wea">
        /// Receives one <c>WeatherList</c> per parsed row; empty (never null) when
        /// the table or its rows are not found.
        /// </param>
        /// <returns>
        /// One <c>"\r\n" + date + ":" + weather + "," + temperature + "," + wind</c>
        /// entry per parsed row, or an empty string when nothing was parsed.
        /// </returns>
        /// <exception cref="FormatException">
        /// A date cell cannot be parsed by <see cref="DateTime.Parse(string)"/>.
        /// </exception>
        public  string ParsePageByCityMonth(String cityCode, Int32 year, Int32 month,out List<WeatherList> wea)
        {
            StringBuilder report = new StringBuilder();
            List<WeatherList> days = new List<WeatherList>();
            wea = days;

            // Build the URL from the pinyin code and the year/month.
            String url = String.Format("http://www.tianqihoubao.com/lishi/{0}/month/{1}{2:D2}.html", cityCode, year, month);
            // Download the page source and load it into a document object.
            var pageSource = GetWebClient(url);
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(pageSource);

            // Absolute XPath to the history table.
            var table = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[2]/div[6]/div[1]/div[1]/table[1]");
            if (table == null)
            {
                return report.ToString();
            }

            // SelectNodes yields null when nothing matches, so guard before use.
            var rows = table.SelectNodes(@"tr");
            if (rows == null)
            {
                return report.ToString();
            }

            // Start at index 1: the first row is the table header. Skipping by index
            // also avoids mutating the node collection and is safe when it is empty.
            for (int i = 1; i < rows.Count; i++)
            {
                var cells = rows[i].SelectNodes(@"td");
                // Expected columns: date - weather - temperature - wind.
                if (cells == null || cells.Count != 4)
                {
                    continue;
                }

                var dateText = CleanCellText(cells[0].InnerText);
                var climate = CleanCellText(cells[1].InnerText);
                var temperature = CleanCellText(cells[2].InnerText);
                var wind = CleanCellText(cells[3].InnerText);

                // Each entry starts on its own line, matching ParsePageByArea's output.
                report.Append("\r\n" + string.Format("{0}:{1},{2},{3}", dateText, climate, temperature, wind));

                WeatherList day = new WeatherList();
                day.Climate = climate;
                // NOTE(review): DateTime.Parse uses the current culture; confirm the
                // site's date text parses on machines with a non-Chinese locale.
                day.Date = DateTime.Parse(dateText);
                day.Temperature = temperature;
                day.WindDirection = wind;
                days.Add(day);
            }

            return report.ToString();
        }

        /// <summary>
        /// Strips CR/LF pairs and spaces from a table cell's text, then trims it.
        /// </summary>
        private static string CleanCellText(string raw)
        {
            return raw.Replace("\r\n", "").Replace(" ", "").Trim();
        }
        /// <summary>
        /// Downloads a page with ScrapySharp's <c>ScrapingBrowser</c> and returns the
        /// text of its <c>title</c> element(s).
        /// Example page: http://www.dusystem.com/geovindu.html
        /// </summary>
        /// <param name="url">Absolute address of the page to download.</param>
        /// <returns>
        /// The concatenated title text, followed by any content matched by the second
        /// query below; an empty string when nothing matches.
        /// </returns>
        public string getHtmlTitle(string url)
        {
            StringBuilder result = new StringBuilder();

            var browser = new ScrapingBrowser();
            var markup = browser.DownloadString(new Uri(url));
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(markup);
            var root = doc.DocumentNode;

            // "//title" searches the whole document. The previous relative path "title"
            // only looked at direct children of the document node, so it normally
            // matched nothing and the resulting null crashed the foreach.
            var titles = root.SelectNodes("//title");
            if (titles != null)
            {
                foreach (var titleNode in titles)
                {
                    result.Append(titleNode.InnerText);
                }
            }

            // NOTE(review): this query is kept as written apart from the null guard.
            // "p" is relative to the document node, and Elements() filters children by
            // tag name, so the CSS-style "div#endText" can never match. The original
            // author's note mentions CssSelect / CssSelectAncestors; if the article body
            // is wanted, an XPath such as "//div[@id='endText']" is probably intended.
            var paragraphs = root.SelectNodes("p");
            if (paragraphs != null)
            {
                foreach (var child in paragraphs.Elements("div#endText"))
                {
                    result.Append(child.InnerHtml);
                }
            }

            return result.ToString();
        }
        /// <summary>
        /// Click handler: looks up the cities for the area code typed in
        /// <c>textBox1</c>, shows the textual report in <c>richTextBox1</c> and binds
        /// the resulting list to <c>comboBox1</c> (name displayed, code as value).
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            string areaCode = textBox1.Text.Trim();

            List<CityList> cities;
            string report = ParsePageByArea(areaCode, out cities);
            richTextBox1.Text = report;

            comboBox1.DataSource = cities;
            comboBox1.DisplayMember = "CityName";
            comboBox1.ValueMember = "CityCode";
        }
        /// <summary>
        /// Click handler: loads the previous calendar month's weather history for the
        /// city selected in <c>comboBox1</c>, shows the textual report in
        /// <c>richTextBox2</c> and binds the records to <c>dataGridView1</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            // SelectedValue is null until the combo box is bound and an item is selected.
            object selectedCode = this.comboBox1.SelectedValue;
            if (selectedCode == null)
            {
                return;
            }

            // AddMonths(-1) rolls over the year boundary; "Month - 1" produced month 0
            // with the current year when run in January.
            DateTime previousMonth = DateTime.Now.AddMonths(-1);

            List<WeatherList> list;
            this.richTextBox2.Text = ParsePageByCityMonth(selectedCode.ToString(), previousMonth.Year, previousMonth.Month, out list);
            this.dataGridView1.DataSource = list;
        }

    }
    /// <summary>
    /// One city entry: a display name plus the code used in page URLs.
    /// Instances are created by <c>Form1.ParsePageByArea</c>.
    /// </summary>
    public class CityList
    {
        /// <summary>
        /// City name; filled from the trimmed inner text of the city's link.
        /// </summary>
        public string CityName { get; set; }
        /// <summary>
        /// City code; filled from the second-to-last segment of the link's href
        /// after splitting on '/' and '.'.
        /// </summary>
        public string CityCode { get; set; }
    }

    /// <summary>
    /// One day's weather record: climate, temperature, wind direction and date.
    /// Instances are created by <c>Form1.ParsePageByCityMonth</c>, one per table row.
    /// </summary>
    public class WeatherList
    {
        /// <summary>
        /// Climate (weather condition); text of the row's second cell.
        /// </summary>
        public string Climate { get; set; }
        /// <summary>
        /// Temperature; text of the row's third cell, kept as a string.
        /// </summary>
        public string Temperature { get; set; }
        /// <summary>
        /// Wind direction; text of the row's fourth cell.
        /// </summary>
        public string WindDirection { get; set; }
        /// <summary>
        /// Date of the record; parsed from the text of the row's first cell.
        /// </summary>
        public DateTime Date { get; set; }
    }
   
}

  

  /// <summary>
  /// Demo handler that downloads one fixed weather-history page and walks through
  /// a series of HtmlAgilityPack node APIs, printing the results to the console.
  /// </summary>
  /// <remarks>
  /// NOTE(review): in this listing the method sits after the namespace's closing
  /// brace; it calls GetWebClient, which is declared in Form1, so it presumably
  /// belongs inside that class.
  /// NOTE(review): the "sample output" remarks below come from the original
  /// author's notes and mention a &lt;ul&gt; element, which does not match the
  /// "//*" query used here; treat them as illustrative only.
  /// </remarks>
  /// <param name="sender">Event source (not used).</param>
  /// <param name="e">Event data (not used).</param>
  private void button3_Click(object sender, EventArgs e)
        {
            // NOTE(review): year and mont are computed but never used below.
            int year = DateTime.Now.Year;
            int mont = DateTime.Now.Month - 1;
            string url = "http://www.tianqihoubao.com/lishi/dalian/month/201802.html";
            var docText = GetWebClient(url);
            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();           
            
            document.LoadHtml(docText);

           // document.OptionOutputAsXml = true;

            // First <div> anywhere in the document, or null when there is none (unused below).
            var divname = document.DocumentNode.Descendants("div").FirstOrDefault();

            // NOTE(review): Single() throws unless exactly one node is returned, and
            // SelectNodes is documented to return null when nothing matches.
            var body = document.DocumentNode.SelectNodes("//body").Single();

            // NOTE(review): same hazard as above; this throws if the page has more than one table.
            var ta = document.DocumentNode.SelectNodes("//table").Single();

            // ToArray() snapshots the matches first, so nodes can be removed while looping.
            foreach (var script in document.DocumentNode.Descendants("script").ToArray())
                script.Remove();
            foreach (var style in document.DocumentNode.Descendants("style").ToArray())
                style.Remove();

           // foreach (var comment in document.DocumentNode.SelectNodes("//comment()").ToArray())
            //    comment.Remove();// newly added code

            //document.DocumentNode.SelectSingleNode("//div[@id='myTrips']").SelectNodes(".//li");
            // The rows that show the weather: inner HTML of every <tr> under a table with class 'b'.
            List<string> paragraphs = document.DocumentNode.SelectNodes("//table[@class='b']//tr").Select(paragraphNode => paragraphNode.InnerHtml).ToList();

            // Value attribute of the first <input> found inside a <td>.
            // NOTE(review): throws NullReferenceException if the node or the attribute is missing.
            string name = document.DocumentNode.SelectSingleNode("//td/input").Attributes["value"].Value;


           // List<string> paragraphs = document.DocumentNode.SelectNodes("//table[contains(@class, 'b')]//tr").Select(paragraphNode => paragraphNode.InnerHtml).ToList();////b: is class name
            //XPath: /html[1]/body[1]/form[1]/div[2]/div[6]/div[1]/div[1]/table[1]/tr[1]
            HtmlNode tablenode = document.DocumentNode.SelectSingleNode("//table[@class='b']//tr");     // 'b' is the class name; finds a node by XPath, much like XmlNode

            

            // "//*" selects the first element in the document; the calls below inspect it.
            HtmlNode node = document.DocumentNode.SelectSingleNode("//*");



            IEnumerable<HtmlNode> nodeList = node.Ancestors();  // all ancestor nodes of this element
            foreach (HtmlNode item in nodeList)
            {
                Console.Write(item.Name + " ");   // sample output: div div body html #document
            }

            HtmlAttributeCollection attrs = node.Attributes;
            foreach (var item in attrs)
            {
                Console.WriteLine(item.Name + " : " + item.Value);    // sample output: class :user_match clear
            }

            HtmlNodeCollection CNodes = node.ChildNodes;    // all child nodes
            foreach (HtmlNode item in CNodes)
            {
                Console.WriteLine(item.Name + "-" + item.InnerText);  // note: text nodes are listed as children too
            }

            HtmlAttributeCollection attrs1 = node.ClosingAttributes;    // attributes found on the closing tag, e.g. </ul class="">
            Console.WriteLine(attrs1.Count);    // sample output: 0

            HtmlNode node1 = node.FirstChild;   // in the author's sample the first child was a "\n" text node, not the first <li>
            Console.WriteLine(node1.NodeType);  // sample output: Text (text node)
            HtmlNode node3 = node.LastChild;    // likewise, the last child was a "\n" text node
            Console.WriteLine(node3.NodeType);  // sample output: Text (text node)

            // First <div> child of the current node.
            // NOTE(review): SelectSingleNode returns null when there is no match, which
            // would make the next line throw.
            HtmlNode node2 = node.SelectSingleNode("child::div[1]");
            Console.WriteLine(node2.XPath);     // XPath generated for the node, e.g. /html[1]/body[1]/form[1]/div[2]/div[6]/div[1]/div[1]/table[1]/tr[1]

            Console.WriteLine(node.HasAttributes);          // sample output: True  - whether the node has attributes
            Console.WriteLine(node.HasChildNodes);          // sample output: True  - whether the node has child nodes
            Console.WriteLine(node.HasClosingAttributes);   // sample output: False - whether the closing tag has attributes

            Console.WriteLine(node.Line);           // sample output: 155 - line of the node's opening tag in the page source
            Console.WriteLine(node.LinePosition);   // sample output: 1   - column of the node's opening tag
            Console.WriteLine(node.NodeType);       // sample output: Element - the node type
            Console.WriteLine(node.OriginalName);   // sample output: ul
            HtmlNode node4 = node.SelectSingleNode("child::div[1]");
            Console.WriteLine(node4.InnerText);     // prints the node's inner text
            HtmlNode node5 = node4.NextSibling.NextSibling;     // next sibling element; stepped twice to skip the newline text node in between
            Console.WriteLine(node5.InnerText);     // prints the node's inner text
            HtmlNode node6 = node5.PreviousSibling.PreviousSibling;     // stepped back twice for the same reason
            Console.WriteLine(node6.InnerText);     // prints the node's inner text
            HtmlNode node7 = node6.ParentNode;      // parent node
            Console.WriteLine(node7.Name);          // sample output: ul
            string str = node.OuterHtml;
            Console.WriteLine(str);     // prints the node's full markup (sample: the whole ul ... class="user_match clear">)
            Console.WriteLine(node.StreamPosition); // sample output: 7331 - position of the node relative to the start of the page source

            HtmlAgilityPack.HtmlDocument doc1 = node.OwnerDocument;

            // "//div" is an absolute path, so this visits every <div> in the document,
            // not only those under body.
            foreach (HtmlAgilityPack.HtmlNode div in body.SelectNodes("//div"))
            {
                var classValue = div.Attributes["class"] == null ? null : div.Attributes["class"].Value;

                if (classValue == "first")
                {
                    //write innerText into a table at place [i][column1]
                }
                else if (classValue == "second")
                {
                    //write innerText into the same table in [i][column2]
                }
            }

            // NOTE(review): Single(predicate) throws unless exactly one <div> has class "first".
            string innerText1 = document.DocumentNode.SelectSingleNode("//body").SelectNodes("//div").Single(n => n.Attributes.Any(a => a.Name == "class" && a.Value == "first")).InnerText;
        }