C#: Using HtmlAgilityPack and ScrapySharp to read a URL and find text
程序员文章站
2022-06-25 16:58:10
https://github.com/exaphaser/ScrapySharp https://github.com/zzzprojects/html-agility-pack https://github.com/atifaziz/Fizzler https://archive.codeplex ......
https://github.com/exaphaser/ScrapySharp
https://github.com/zzzprojects/html-agility-pack
https://github.com/atifaziz/Fizzler
https://archive.codeplex.com/?p=fizzlerex
https://github.com/aspnet/blazor
https://github.com/SteveSanderson/Blazor
https://www.mathjax.org/#samples (math formulas)
https://github.com/robinvanderknaap/MvcJqGrid
http://www.defenseinnovationmarketplace.mil/strategy.html
using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Windows.Forms;
using HtmlAgilityPack;
using ScrapySharp;
using ScrapySharp.Core;
using ScrapySharp.Network;

namespace HtmlAgilityPackDemo
{
    /// <summary>
    /// Demo form that downloads pages from www.tianqihoubao.com and extracts
    /// city lists and monthly weather history with HtmlAgilityPack.
    /// Author: geovindu (Tu Juwen), 2018-03-05.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Pre-fills the area code text box with "ln".
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data.</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            // NOTE(review): "ln" is presumably the code for Liaoning province - confirm.
            this.textBox1.Text = "ln";
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its content as text.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The full response body decoded with <see cref="Encoding.Default"/>.</returns>
        public static string GetWebClient(string url)
        {
            // All three objects are IDisposable; the using statements release them
            // even when the download or the read throws.
            using (WebClient client = new WebClient())
            using (Stream stream = client.OpenRead(url))
            // Mind the encoding: Encoding.Default depends on the machine's settings.
            // NOTE(review): the target site presumably serves GB2312/GBK - confirm.
            using (StreamReader reader = new StreamReader(stream, Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the city index page of an area and extracts every city link.
        /// </summary>
        /// <param name="cityCode">Area code inserted into the page URL (for example "ln").</param>
        /// <param name="listcity">Receives the parsed cities; empty when nothing was found.</param>
        /// <returns>
        /// One "name:code" entry per city, each preceded by a CR/LF pair,
        /// or an empty string when nothing was found.
        /// </returns>
        public string ParsePageByArea(String cityCode, out List<CityList> listcity)
        {
            StringBuilder output = new StringBuilder();
            List<CityList> cities = new List<CityList>();
            listcity = cities;

            // Build the URL from the link pattern and the area code.
            string url = String.Format("http://www.tianqihoubao.com/lishi/{0}.htm", cityCode);

            // Download the page source and load it into a document object.
            string docText = GetWebClient(url);
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(docText);

            // Locate the container by XPath, then pick its dl children.
            HtmlNode container = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[1]/div[6]/div[1]/div[1]/div[3]");
            if (container == null)
            {
                return "";
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection groups = container.SelectNodes(@"dl");
            if (groups == null || groups.Count < 1)
            {
                return "";
            }

            foreach (HtmlNode group in groups)
            {
                HtmlNode dd = group.SelectSingleNode(@"dd");
                HtmlNodeCollection links = dd == null ? null : dd.SelectNodes("a");
                if (links == null)
                {
                    continue;
                }

                foreach (HtmlNode link in links)
                {
                    HtmlAttribute href = link.Attributes["href"];
                    if (href == null)
                    {
                        continue;
                    }

                    // The pinyin city code is the path segment just before the file extension.
                    string[] parts = href.Value.Trim().Split('/', '.');
                    if (parts.Length < 2)
                    {
                        continue;
                    }

                    string text = link.InnerText.Trim();
                    string code = parts[parts.Length - 2];

                    CityList city = new CityList();
                    city.CityName = text;
                    city.CityCode = code;
                    cities.Add(city);

                    output.Append("\r\n" + string.Format("{0}:{1}", text, code));
                }
            }

            return output.ToString();
        }

        /// <summary>
        /// Downloads the monthly weather history page of a city and parses its table.
        /// Example page: http://www.tianqihoubao.com/lishi/dalian/month/201802.html
        /// </summary>
        /// <param name="cityCode">Pinyin city code inserted into the page URL.</param>
        /// <param name="year">Four-digit year.</param>
        /// <param name="month">Month number; formatted with two digits in the URL.</param>
        /// <param name="wea">Receives one entry per table row whose date could be parsed.</param>
        /// <returns>The concatenated "date:weather,temperature,wind" text of all data rows.</returns>
        public string ParsePageByCityMonth(String cityCode, Int32 year, Int32 month, out List<WeatherList> wea)
        {
            StringBuilder output = new StringBuilder();
            List<WeatherList> records = new List<WeatherList>();
            wea = records;

            // Build the URL from the pinyin code and the year/month.
            string url = String.Format("http://www.tianqihoubao.com/lishi/{0}/month/{1}{2:D2}.html", cityCode, year, month);

            // Download the page source and load it into a document object.
            string docText = GetWebClient(url);
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(docText);

            // Locate the table by XPath.
            HtmlNode table = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[2]/div[6]/div[1]/div[1]/table[1]");
            if (table == null)
            {
                return "";
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection rows = table.SelectNodes(@"tr");
            if (rows == null)
            {
                return "";
            }

            // The first row is the table header; skip it instead of removing it
            // from the collection so the node collection is left untouched.
            foreach (HtmlNode row in rows.Skip(1))
            {
                // Expected columns: date, weather, temperature, wind.
                HtmlNodeCollection cells = row.SelectNodes(@"td");
                if (cells == null || cells.Count != 4)
                {
                    continue;
                }

                string date1 = CleanCellText(cells[0]);
                string tq = CleanCellText(cells[1]);
                string qw = CleanCellText(cells[2]);
                string fx = CleanCellText(cells[3]);

                output.Append(string.Format("{0}:{1},{2},{3}", date1, tq, qw, fx));

                // A single malformed date must not abort the whole page: the raw text
                // is still reported above, only the structured record is skipped.
                DateTime date;
                if (!DateTime.TryParse(date1, out date))
                {
                    continue;
                }

                WeatherList record = new WeatherList();
                record.Climate = tq;
                record.Date = date;
                record.Temperature = qw;
                record.WindDirection = fx;
                records.Add(record);
            }

            return output.ToString();
        }

        /// <summary>
        /// Returns the inner text of a table cell with CR/LF pairs and spaces removed.
        /// </summary>
        private static string CleanCellText(HtmlNode cell)
        {
            return cell.InnerText.Replace("\r\n", "").Replace(" ", "").Trim();
        }

        /// <summary>
        /// Downloads a page with ScrapySharp's <c>ScrapingBrowser</c> and returns its title
        /// followed by the inner HTML of the paragraphs inside the element with id "endText".
        /// Example page: http://www.dusystem.com/geovindu.html
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>Title text followed by paragraph HTML; empty when neither is present.</returns>
        public string getHtmlTitle(string url)
        {
            StringBuilder result = new StringBuilder();

            var browser = new ScrapingBrowser();
            string pageSource = browser.DownloadString(new Uri(url));

            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(pageSource);
            HtmlNode root = doc.DocumentNode;

            // "//title" searches the whole document. The previous relative path "title"
            // only matched direct children of the document node, so for a normal page
            // SelectNodes returned null and the loop failed.
            HtmlNodeCollection titles = root.SelectNodes("//title");
            if (titles != null)
            {
                foreach (HtmlNode titleNode in titles)
                {
                    result.Append(titleNode.InnerText);
                }
            }

            // The original passed the CSS-style selector "div#endText" to Elements(),
            // which matches by element name only and therefore never matched.
            // NOTE(review): interpreted as "paragraphs inside div#endText" - confirm intent.
            HtmlNodeCollection paragraphs = root.SelectNodes("//div[@id='endText']//p");
            if (paragraphs != null)
            {
                foreach (HtmlNode paragraph in paragraphs)
                {
                    result.Append(paragraph.InnerHtml);
                }
            }

            return result.ToString();
        }

        /// <summary>
        /// Loads the cities of the area typed into the text box and binds them to the combo box.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data.</param>
        private void button1_Click(object sender, EventArgs e)
        {
            List<CityList> list;
            this.richTextBox1.Text = ParsePageByArea(this.textBox1.Text.Trim(), out list);
            this.comboBox1.DataSource = list;
            this.comboBox1.DisplayMember = "CityName";
            this.comboBox1.ValueMember = "CityCode";
        }

        /// <summary>
        /// Loads last month's weather history for the selected city and binds it to the grid.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data.</param>
        private void button2_Click(object sender, EventArgs e)
        {
            // Nothing is selected until the city list has been loaded.
            object selected = this.comboBox1.SelectedValue;
            if (selected == null)
            {
                return;
            }

            // AddMonths handles the year rollover; "DateTime.Now.Month - 1" evaluates to 0 in January.
            DateTime previousMonth = DateTime.Now.AddMonths(-1);

            List<WeatherList> list;
            this.richTextBox2.Text = ParsePageByCityMonth(selected.ToString(), previousMonth.Year, previousMonth.Month, out list);
            this.dataGridView1.DataSource = list;
        }
    }

    /// <summary>
    /// A city entry scraped from an area index page.
    /// </summary>
    public class CityList
    {
        /// <summary>
        /// Display name taken from the link text.
        /// </summary>
        public string CityName { get; set; }

        /// <summary>
        /// Pinyin code taken from the link's href.
        /// </summary>
        public string CityCode { get; set; }
    }

    /// <summary>
    /// Climate, temperature, wind direction
    /// </summary>
    public class WeatherList
    {
        /// <summary>
        /// Climate (weather condition).
        /// </summary>
        public string Climate { get; set; }

        /// <summary>
        /// Temperature.
        /// </summary>
        public string Temperature { get; set; }

        /// <summary>
        /// Wind direction.
        /// </summary>
        public string WindDirection { get; set; }

        /// <summary>
        /// Date of the observation.
        /// </summary>
        public DateTime Date { get; set; }
    }
}
/// <summary>
/// Walk-through of the HtmlAgilityPack node API: downloads a fixed weather page,
/// strips script and style elements, runs a few XPath queries and writes details
/// of the first element of the document to the console.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data.</param>
private void button3_Click(object sender, EventArgs e)
{
    string url = "http://www.tianqihoubao.com/lishi/dalian/month/201802.html";
    string docText = GetWebClient(url);

    HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(docText);
    HtmlNode root = document.DocumentNode;

    // ToArray() snapshots the matches so nodes can be removed while iterating.
    foreach (HtmlNode script in root.Descendants("script").ToArray())
    {
        script.Remove();
    }
    foreach (HtmlNode style in root.Descendants("style").ToArray())
    {
        style.Remove();
    }

    // Weather rows: inner HTML of every tr in the table whose class is "b".
    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection rowNodes = root.SelectNodes("//table[@class='b']//tr");
    List<string> paragraphs = rowNodes == null
        ? new List<string>()
        : rowNodes.Select(rowNode => rowNode.InnerHtml).ToList();

    // Value of the first input found inside a td, or null when it is absent.
    HtmlNode input = root.SelectSingleNode("//td/input");
    HtmlAttribute valueAttribute = input == null ? null : input.Attributes["value"];
    string name = valueAttribute == null ? null : valueAttribute.Value;

    // Find a node by XPath, much like XmlNode; yields the first matching row or null.
    HtmlNode tablenode = root.SelectSingleNode("//table[@class='b']//tr");

    // "//*" selects the first element of the document.
    HtmlNode node = root.SelectSingleNode("//*");
    if (node == null)
    {
        // The document contains no elements, so there is nothing to inspect.
        return;
    }

    // All ancestors of the element.
    foreach (HtmlNode item in node.Ancestors())
    {
        Console.Write(item.Name + " ");
    }

    // All attributes of the element.
    foreach (HtmlAttribute item in node.Attributes)
    {
        Console.WriteLine(item.Name + " : " + item.Value);
    }

    // All child nodes; text nodes count as children too.
    foreach (HtmlNode item in node.ChildNodes)
    {
        Console.WriteLine(item.Name + "-" + item.InnerText);
    }

    // Attributes written on the closing tag, e.g. </ul class="">.
    HtmlAttributeCollection closingAttributes = node.ClosingAttributes;
    Console.WriteLine(closingAttributes.Count);

    // First and last child are often whitespace text nodes rather than elements.
    HtmlNode firstChild = node.FirstChild;
    if (firstChild != null)
    {
        Console.WriteLine(firstChild.NodeType);
    }
    HtmlNode lastChild = node.LastChild;
    if (lastChild != null)
    {
        Console.WriteLine(lastChild.NodeType);
    }

    // First div child of the element; null when the element has no div child.
    HtmlNode firstDiv = node.SelectSingleNode("child::div[1]");
    if (firstDiv != null)
    {
        // XPath expression generated for the node.
        Console.WriteLine(firstDiv.XPath);
    }

    Console.WriteLine(node.HasAttributes);        // whether the element has attributes
    Console.WriteLine(node.HasChildNodes);        // whether the element has child nodes
    Console.WriteLine(node.HasClosingAttributes); // whether the closing tag has attributes
    Console.WriteLine(node.Line);                 // line of the opening tag in the page source
    Console.WriteLine(node.LinePosition);         // column of the opening tag
    Console.WriteLine(node.NodeType);             // node type
    Console.WriteLine(node.OriginalName);         // element name as written in the source

    if (firstDiv != null)
    {
        Console.WriteLine(firstDiv.InnerText);

        // Two steps forward: the immediate sibling is usually a line-break text node.
        HtmlNode next = firstDiv.NextSibling == null ? null : firstDiv.NextSibling.NextSibling;
        if (next != null)
        {
            Console.WriteLine(next.InnerText);

            // Two steps back again, skipping the text node the same way.
            HtmlNode back = next.PreviousSibling == null ? null : next.PreviousSibling.PreviousSibling;
            if (back != null)
            {
                Console.WriteLine(back.InnerText);

                HtmlNode parent = back.ParentNode;
                if (parent != null)
                {
                    Console.WriteLine(parent.Name);
                }
            }
        }
    }

    // Full markup of the element, then its stream position in the page source.
    Console.WriteLine(node.OuterHtml);
    Console.WriteLine(node.StreamPosition);

    // "//div" is an absolute path, so it matches every div in the document
    // even though it is evaluated from the body node.
    HtmlNode body = root.SelectSingleNode("//body");
    HtmlNodeCollection divs = body == null ? null : body.SelectNodes("//div");
    if (divs == null)
    {
        return;
    }

    foreach (HtmlAgilityPack.HtmlNode div in divs)
    {
        var classValue = div.Attributes["class"] == null ? null : div.Attributes["class"].Value;
        if (classValue == "first")
        {
            //write innerText into a table at place [i][column1]
        }
        else if (classValue == "second")
        {
            //write innerText into the same table in [i][column2]
        }
    }

    // Text of the first div with class "first"; null when the page has none.
    // FirstOrDefault replaces Single, which throws on zero or several matches.
    HtmlNode firstClassDiv = divs.FirstOrDefault(n => n.Attributes.Any(a => a.Name == "class" && a.Value == "first"));
    string innerText1 = firstClassDiv == null ? null : firstClassDiv.InnerText;
}