Python 爬取英语学习资料并发送邮件
程序员文章站
2023-11-11 17:17:40
本文先新建一个发送邮件的类,然后爬取英语学习资料。例如,爬取英语学习链接 http://www.hjenglish.com/new/c1020/ ,抓取当前页的文章并通过邮件发送到指定邮箱。……
新建发送邮件类
import smtplib
from email.header import Header
from email.mime.text import MIMEText


class sendmail:
    """Send plain-text e-mail through an SMTP server using STARTTLS.

    All connection settings are stored as instance attributes so callers
    can override them after construction.
    """

    def __init__(self):
        """Initialise the sender, recipients and SMTP connection settings."""
        self.sender = 'xx@qq.com'
        # Recipients; may be QQ mailboxes or any other addresses.
        self.receivers = ['xx1@qq.com', 'xx2@qq.com']
        self.smtp_server = 'smtp.qq.com'
        self.smtp_pwd = 'xx'
        # NOTE: the attribute keeps its original "stmp" spelling so that
        # existing code reading or overriding it continues to work.
        self.stmp_port = 25

    def sendmessage(self, title, msg):
        """Send one UTF-8 plain-text message to every configured recipient.

        Args:
            title: Subject line of the message.
            msg: Plain-text body of the message.

        Raises:
            smtplib.SMTPException: If the STARTTLS upgrade, the login or
                the delivery fails.
            OSError: If the connection to the server cannot be established.
        """
        # MIMEText arguments: body text, subtype ("plain"), charset.
        message = MIMEText(msg, 'plain', 'utf-8')
        message['From'] = self.sender
        # Address lists in a header are comma-separated, not ';'-separated.
        message['To'] = ', '.join(self.receivers)
        message['Subject'] = Header(title, 'utf-8')

        # The context manager sends QUIT and closes the socket even when
        # login or delivery raises.
        with smtplib.SMTP(self.smtp_server, self.stmp_port) as smtpobj:
            smtpobj.starttls()
            smtpobj.login(self.sender, self.smtp_pwd)
            smtpobj.sendmail(self.sender, self.receivers, message.as_string())
        print('success')
爬取英语学习资料
例如,爬取英语学习链接 http://www.hjenglish.com/new/c1020/ ,抓取当前页的所有文章,并把每篇文章通过邮件发送到指定邮箱:
import datetime
import os
import threading
import time

import requests
import schedule
import xlwt
from bs4 import BeautifulSoup

from mymodule.sendmail import *

# Request headers shared by the list-page and the article-page fetches.
_REQUEST_HEADERS = {
    'Host': 'www.hjenglish.com',
    'Referer': 'http://www.hjenglish.com/new/cet/',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    ),
}

# Number of articles processed so far; updated by worker threads under a lock.
total = 0


def getlinks(url):
    """Return the absolute article URLs found on the list page *url*.

    Any request or parsing error is printed and an empty list is returned,
    so the caller can always iterate over the result.
    """
    try:
        res = requests.get(url, headers=_REQUEST_HEADERS)
        try:
            res.raise_for_status()
            page = BeautifulSoup(res.text, 'lxml')
        finally:
            res.close()
        return [
            'http://www.hjenglish.com' + adom['href']
            for adom in page.select('.big-link.title-article')
        ]
    except Exception as err:
        print(err)
        return []


def spiderlink(url, lock):
    """Fetch one article page and mail every title/content pair found on it.

    Args:
        url: Absolute URL of the article page.
        lock: Lock guarding the module-level ``total`` counter.

    Relies on the module-level ``sender`` object (created in the
    ``__main__`` guard) to deliver the messages. Errors are printed rather
    than raised so one failing page does not affect the other threads.
    """
    global total
    print('当前线程', threading.current_thread().name)
    try:
        res = requests.get(url, headers=_REQUEST_HEADERS)
        try:
            # Pages that do not answer 200 are skipped silently.
            if res.status_code != 200:
                return
            page = BeautifulSoup(res.text, 'lxml')
        finally:
            res.close()

        titles = [title.get_text() for title in page.select('.article-header .title')]
        contents = [contentdom.get_text() for contentdom in page.select('#j-article-content')]
        print(titles, contents)

        # zip() pairs titles with contents and stops at the shorter list,
        # so a page with mismatched counts cannot raise IndexError.
        for title, content in zip(titles, contents):
            with lock:
                total = total + 1
            print(title, content)
            sender.sendmessage(title, content)
    except Exception as err:
        print(err)


def my_job():
    """Crawl the list page, process every article in its own thread, and
    print the elapsed time.

    On an unexpected error the message is printed and the process is
    terminated immediately via ``os._exit(0)``, as in the original script.
    """
    global total
    try:
        starttime = datetime.datetime.now()
        url = 'http://www.hjenglish.com/new/c1020/'
        lock = threading.Lock()
        spider_links = getlinks(url)
        threads = [
            threading.Thread(target=spiderlink, args=(link, lock))
            for link in spider_links
        ]
        for worker in threads:
            worker.start()
        for worker in threads:
            worker.join()
        endtime = datetime.datetime.now()
        print('have spend ', str((endtime - starttime).seconds) + 's')
        # Reset the counter so a later run starts from zero.
        total = 0
    except Exception as err:
        print(err)
        os._exit(0)


if __name__ == '__main__':
    try:
        sender = sendmail()
        total = 0
        my_job()
    except Exception as err:
        print(err)
        os._exit(0)