.NET Core 实现定时抓取网站文章并发送到邮箱

程序员文章站 2022-07-05 16:28:56

前言大家好，我是晓晨。许久没有更新博客了，今天给大家带来一篇干货型文章，一个每隔5分钟抓取博客园首页文章信息并在第二天的上午9点发送到你的邮箱的小工具。比如我在2018...

前言

大家好，我是晓晨。许久没有更新博客了，今天给大家带来一篇干货型文章，一个每隔5分钟抓取博客园首页文章信息并在第二天的上午9点发送到你的邮箱的小工具。比如我在2018年2月14日，9点来到公司我就会收到一封邮件，是2018年2月13日的博客园首页的文章信息。写这个小工具的初衷是，一直有看博客的习惯，但是最近由于各种原因吧，可能几天都不会看一下博客，要是中途错过了什么好文可是十分心疼的哈哈。所以做了个工具，每天归档发到邮箱，妈妈再也不会担心我错过好的文章了。为什么只抓取首页？因为博客园首页文章的质量相对来说高一些。

准备

作为一个持续运行的工具，没有日志记录怎么行，我准备使用的是nlog来记录日志，它有个日志归档功能非常不错。在http请求中，由于网络问题吧可能会出现失败的情况，这里我使用polly来进行retry。使用htmlagilitypack来解析网页，需要对xpath有一定了解。下面是详细说明：

组件名	用途	github
nlog	记录日志	https://github.com/nlog/nlog
polly	当http请求失败，进行重试	https://github.com/app-vnext/polly
htmlagilitypack	网页解析	https://github.com/zzzprojects/html-agility-pack
mailkit	发送邮件	https://github.com/jstedfast/mailkit

有不了解的组件，可以通过访问github获取资料。

参考文章

获取&解析博客园首页数据

我使用的是httpwebrequest来进行http请求，下面分享一下我简单封装的类库：

using system;
using system.io;
using system.net;
using system.text;

namespace cnblogsubscribetool
{
    /// <summary>
    /// Simple http request helper built on httpwebrequest.
    /// .net framework >= 4.0
    /// author:stulzq
    /// createdtime:2017-12-12 15:54:47
    /// </summary>
    public class httputil
    {
        static httputil()
        {
            // Raise the connection limit (the default limit is 2).
            servicepointmanager.defaultconnectionlimit = 1024;
        }

        /// <summary>
        /// Default request timeout (20000, i.e. 20s).
        /// </summary>
        public static int defaulttimeout = 20000;

        /// <summary>
        /// Whether requests follow redirects automatically.
        /// </summary>
        public static bool defalutallowautoredirect = true;

        /// <summary>
        /// Encoding used for the post body and for decoding response text.
        /// </summary>
        public static encoding defaultencoding = encoding.utf8;

        /// <summary>
        /// User-agent header sent with every request.
        /// </summary>
        public static string defaultuseragent =
            "mozilla/5.0 (windows nt 6.1; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/62.0.3202.94 safari/537.36";

        /// <summary>
        /// Referer header sent with every request.
        /// </summary>
        public static string defaultreferer = "";

        /// <summary>
        /// Sends a get request and returns the response body as text.
        /// </summary>
        /// <param name="url">internet address</param>
        /// <returns>response body decoded with <see cref="defaultencoding"/></returns>
        public static string getstring(string url)
        {
            return readalltext(getstream(url));
        }

        /// <summary>
        /// Sends a post request and returns the response body as text.
        /// </summary>
        /// <param name="url">internet address</param>
        /// <param name="postdata">post request data</param>
        /// <returns>response body decoded with <see cref="defaultencoding"/></returns>
        public static string poststring(string url, string postdata)
        {
            return readalltext(poststream(url, postdata));
        }

        /// <summary>
        /// Builds the request from the default settings, sends it and returns the response.
        /// </summary>
        /// <param name="url">internet address</param>
        /// <param name="post">is post request</param>
        /// <param name="postdata">post request data; null is sent as an empty body</param>
        /// <returns>the response; the caller is responsible for closing it or its stream</returns>
        /// <exception cref="exception">getresponse failed; the original error is kept as the inner exception</exception>
        public static webresponse createresponse(string url, bool post, string postdata = "")
        {
            var httpwebrequest = webrequest.createhttp(url);
            httpwebrequest.timeout = defaulttimeout;
            httpwebrequest.allowautoredirect = defalutallowautoredirect;
            httpwebrequest.useragent = defaultuseragent;
            httpwebrequest.referer = defaultreferer;
            if (post)
            {
                // A null body is sent as an empty one instead of failing inside getbytes.
                var data = defaultencoding.getbytes(postdata ?? "");
                httpwebrequest.method = "post";
                // NOTE(review): the charset below is fixed to utf-8 while the bytes use
                // defaultencoding; the two only agree while defaultencoding stays utf8.
                httpwebrequest.contenttype = "application/x-www-form-urlencoded;charset=utf-8";
                httpwebrequest.contentlength = data.length;
                using (var stream = httpwebrequest.getrequeststream())
                {
                    stream.write(data, 0, data.length);
                }
            }

            try
            {
                return httpwebrequest.getresponse();
            }
            catch (exception e)
            {
                // Add the request details to the error so a failed call can be identified from logs.
                throw new exception(string.format("request error,url:{0},ispost:{1},data:{2},message:{3}", url, post, postdata, e.message), e);
            }
        }

        /// <summary>
        /// Sends a get request and returns the raw response stream.
        /// </summary>
        /// <param name="url">internet address</param>
        /// <returns>response stream</returns>
        /// <exception cref="exception">the response has no stream</exception>
        public static stream getstream(string url)
        {
            return openstream(createresponse(url, false));
        }

        /// <summary>
        /// Sends a post request and returns the raw response stream.
        /// </summary>
        /// <param name="url">internet address</param>
        /// <param name="postdata">post data</param>
        /// <returns>response stream</returns>
        /// <exception cref="exception">the response has no stream</exception>
        public static stream poststream(string url, string postdata)
        {
            return openstream(createresponse(url, true, postdata));
        }

        /// <summary>
        /// Returns the stream of a response, closing the response when it has none.
        /// </summary>
        private static stream openstream(webresponse response)
        {
            var stream = response.getresponsestream();
            if (stream == null)
            {
                // Nothing will ever read this response, so close it before reporting the error.
                response.close();
                throw new exception("response error,the response stream is null");
            }

            return stream;
        }

        /// <summary>
        /// Reads a response stream to the end as text and disposes it.
        /// </summary>
        private static string readalltext(stream stream)
        {
            // Decode with defaultencoding so reading follows the same setting as the post body.
            using (var reader = new streamreader(stream, defaultencoding))
            {
                return reader.readtoend();
            }
        }
    }
}

获取首页数据

// Download the html of the cnblogs home page.
string res = httputil.getstring("https://www.cnblogs.com");

.NET Core 实现定时抓取网站文章并发送到邮箱

解析数据

我们成功获取到了html，但是怎么提取我们需要的信息（文章标题、地址、摘要、作者、发布时间）呢。这里就亮出了我们的利剑htmlagilitypack，他是一个可以根据xpath来解析网页的组件。

载入我们前面获取的html：

// Load the html downloaded earlier (held in res) into a document that can be queried with xpath.
htmldocument doc = new htmldocument();
doc.loadhtml(res);

.NET Core 实现定时抓取网站文章并发送到邮箱

从上图中，我们可以看出，每条文章的所有信息都在一个class为post_item的div里，而我们需要的内容位于其中class为post_item_body的div中，所以我们先获取所有class=post_item_body的div

// Select every article entry: the div elements whose class is post_item_body.
var itemxpath = "//div[@class='post_item_body']";
var itembodys = doc.documentnode.selectnodes(itemxpath);

我们继续分析，可以看出文章的标题在class=post_item_body的div下面的h3标签下的a标签，摘要信息在class=post_item_summary的p标签里面，发布时间和作者在class=post_item_foot的div里，分析完毕，我们可以取出我们想要的数据了：

// HtmlAgilityPack's selectnodes is documented to return null (not an empty list)
// when nothing matches, so skip the loop rather than fail on a null collection.
if (itembodys != null)
{
    foreach (var itembody in itembodys)
    {
        // Title link: carries both the article title and its url.
        var titleelem = itembody.selectsinglenode("h3/a");
        var title = titleelem?.innertext;
        var url = titleelem?.attributes["href"]?.value;

        // Summary paragraph, with line breaks and surrounding whitespace removed.
        var summaryelem = itembody.selectsinglenode("p[@class='post_item_summary']");
        var summary = summaryelem?.innertext.replace("\r\n", "").trim();

        // Footer element: holds the author link and the publish time text.
        var footelem = itembody.selectsinglenode("div[@class='post_item_foot']");
        var author = footelem?.selectsinglenode("a")?.innertext;

        // regex.match rejects a null input, so fall back to an empty string when the
        // footer is missing; publishtime is then an empty string as well.
        var foottext = footelem?.innertext ?? "";
        var publishtime = regex.match(foottext, "\\d+-\\d+-\\d+ \\d+:\\d+").value;

        console.writeline($"标题：{title}");
        console.writeline($"网址：{url}");
        console.writeline($"摘要：{summary}");
        console.writeline($"作者：{author}");
        console.writeline($"发布时间：{publishtime}");
        console.writeline("--------------华丽的分割线---------------");
    }
}

运行一下：

.NET Core 实现定时抓取网站文章并发送到邮箱

我们成功的获取了我们想要的信息。现在我们定义一个blog对象将它们装起来。

/// <summary>
/// Holds the information extracted for a single article.
/// </summary>
public class blog
{
 /// <summary>
 /// Article title
 /// </summary>
 public string title { get; set; }

 /// <summary>
 /// Url of the article
 /// </summary>
 public string url { get; set; }

 /// <summary>
 /// Article summary
 /// </summary>
 public string summary { get; set; }

 /// <summary>
 /// Article author
 /// </summary>
 public string author { get; set; }

 /// <summary>
 /// Time the article was published
 /// </summary>
 public datetime publishtime { get; set; }
}

http请求失败重试

我们使用polly在我们的http请求失败时进行重试，设置为重试3次。

// Initialise the retry policy: any exception triggers up to 3 retries.
// NOTE(review): the field name says "two times" but the policy retries 3 times;
// the name is kept because other code refers to it.
_retrytwotimespolicy =
    policy
        .handle<exception>()
        .retry(3, (ex, count) =>
        {
            // Log which retry this is, then what failed, including the message
            // so the cause is visible without a debugger.
            _logger.error("execution failed! retry {0}", count);
            _logger.error("exception from {0}: {1}", ex.gettype().name, ex.message);
        });

测试一下：

.NET Core 实现定时抓取网站文章并发送到邮箱

可以看到当遇到exception时，polly会帮我们重试三次，如果三次重试都失败了那么会放弃。

发送邮件

使用mailkit来进行邮件发送，它支持imap、pop3和smtp协议，并且是跨平台的，十分优秀。下面是根据前面园友的分享自己封装的一个类库：

using system.collections.generic;
using cnblogsubscribetool.config;
using mailkit.net.smtp;
using mimekit;

namespace cnblogsubscribetool
{
    /// <summary>
    /// Helper for sending e-mail through mailkit's smtpclient.
    /// </summary>
    public class mailutil
    {
        /// <summary>
        /// Connects to the configured smtp server, authenticates and sends the message.
        /// </summary>
        /// <param name="mailmessage">fully built message to send</param>
        /// <param name="config">server host/port and account credentials</param>
        /// <returns>true when the message was sent; failures surface as exceptions</returns>
        private static bool sendmail(mimemessage mailmessage, mailconfig config)
        {
            // The using block disposes the client even when connect, authenticate
            // or send throws, so a failed attempt does not leave it undisposed.
            using (var smtpclient = new smtpclient())
            {
                smtpclient.timeout = 10 * 1000; // timeout; presumably milliseconds (10s)

                // NOTE(review): securesocketoptions.none appears to request an unencrypted
                // connection, so the credentials below would travel in clear text - confirm
                // this is intended for the target server.
                smtpclient.connect(config.host, config.port, mailkit.security.securesocketoptions.none);
                smtpclient.authenticate(config.address, config.password);
                smtpclient.send(mailmessage);
                smtpclient.disconnect(true);
                return true;
            }
        }

        /// <summary>
        /// Builds an html e-mail (optionally with one attachment) and sends it.
        /// </summary>
        /// <param name="config">mail configuration; its name and address are used as the sender</param>
        /// <param name="receives">recipient addresses</param>
        /// <param name="sender">reply-to address; ignored when null or empty</param>
        /// <param name="subject">subject line</param>
        /// <param name="body">html body</param>
        /// <param name="attachments">attachment content, or null for no attachment</param>
        /// <param name="filename">attachment file name; a default name is used when empty</param>
        /// <returns>true when the message was sent; failures surface as exceptions</returns>
        public static bool sendmail(mailconfig config,list<string> receives, string sender, string subject, string body, byte[] attachments = null,string filename="")
        {
            var mailmessage = new mimemessage();
            mailmessage.from.add(new mailboxaddress(config.name, config.address));

            foreach (var add in receives)
            {
                mailmessage.to.add(new mailboxaddress(add));
            }

            if (!string.isnullorempty(sender))
            {
                mailmessage.replyto.add(new mailboxaddress(config.name, sender));
            }

            var bodybuilder = new bodybuilder() { htmlbody = body };

            // Attachment
            if (attachments != null)
            {
                if (string.isnullorempty(filename))
                {
                    filename = "未命名文件.txt";
                }
                var attachment = bodybuilder.attachments.add(filename, attachments);

                // Work around garbled Chinese file names: re-add the name parameters
                // with an explicit charset.
                var charset = "gb18030";
                attachment.contenttype.parameters.clear();
                attachment.contentdisposition.parameters.clear();
                attachment.contenttype.parameters.add(charset, "name", filename);
                attachment.contentdisposition.parameters.add(charset, "filename", filename);

                // Work around file names not being able to exceed 41 characters.
                foreach (var param in attachment.contentdisposition.parameters)
                {
                    param.encodingmethod = parameterencodingmethod.rfc2047;
                }
                foreach (var param in attachment.contenttype.parameters)
                {
                    param.encodingmethod = parameterencodingmethod.rfc2047;
                }
            }

            mailmessage.body = bodybuilder.tomessagebody();
            mailmessage.subject = subject;

            return sendmail(mailmessage, config);
        }
    }
}

测试一下：

.NET Core 实现定时抓取网站文章并发送到邮箱