Python IMAP/POP3收取并解析邮件
MIME邮件格式
Return-Path: <aaa@qq.com>
Delivered-To: ***@**
Received: from m13-61.163.com (EHLO m13-61.163.com) ([220.181.13.61])
by VM_0_5_centos (JAMES SMTP Server ) with ESMTP ID -1461210387
for <aaa@qq.com>;
Fri, 14 Dec 2018 15:35:46 +0800 (CST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=163.com;
s=s110527; h=Date:From:Subject:MIME-Version:Message-ID; bh=LGN55
QdgU9XoPQCr+27/Z+IPGOdDWiYZ7swbWaIAMzM=; b=P3wXeY5gFeuR5YVOBMga4
OHrnfwD+rYOKLUpTuaupgr/d/JftXBkufTMp8tIloy1njIPwt4Vp7oiJHAGT1gSi
k48Fk7CP30e6E8pXk4+LUfVaOinunqbdGgVzkmtkZnZu4U4X/vhRIxNLICB8kTSu
khVfkoXrWroSS2Q6HHAwpQ=
Received: from XXXX$163.com ( [59.83.198.133] ) by
ajax-webmail-wmsvr61 (Coremail) ; Fri, 14 Dec 2018 15:35:45 +0800 (CST)
X-Originating-IP: [59.83.198.133]
Date: Fri, 14 Dec 2018 15:35:45 +0800 (CST)
From: 1 <aaa@qq.com>
To: "aaa@qq.com" <aaa@qq.com>
Subject: kjchjkch
X-Priority: 3
X-Mailer: Coremail Webmail Server Version SP_ntes V3.5 build
20160729(86883.8884) Copyright (c) 2002-2018 www.mailtech.cn 163com
X-CM-CTRLDATA: xShnzGZvb3Rlcl9odG09MTAwOjU2
Content-Type: multipart/mixed;
boundary="----=_Part_183728_654166346.1544772945612"
MIME-Version: 1.0
Message-ID: <aaa@qq.com>
X-Coremail-Locale: zh_CN
X-CM-TRANSID:PcGowACHOSlRXRNcmW4QAA--.464W
X-CM-SenderInfo: 1k1rijqruwmliuz6il2tof0z/1tbiDwUdOVUMJg-i4gACsc
X-Coremail-Antispam: 1U5529EdanIXcx71UUUUU7vcSsGvfC2KfnxnUU==
------=_Part_183728_654166346.1544772945612
Content-Type: multipart/alternative;
boundary="----=_Part_183730_484598546.1544772945613"
------=_Part_183730_484598546.1544772945613
Content-Type: text/plain; charset=GBK
Content-Transfer-Encoding: base64
c2tkaGFza2pkaDG72ODB1/fPsrPJuaYx
------=_Part_183730_484598546.1544772945613
Content-Type: text/html; charset=GBK
Content-Transfer-Encoding: base64
PGRpdiBzdHlsZT0ibGluZS1oZWlnaHQ6MS43O2NvbG9yOiMwMDAwMDA7Zm9udC1zaXplOjE0cHg7
Zm9udC1mYW1pbHk6QXJpYWwiPnNrZGhhc2tqZGgxu9jgwdf3z7KzybmmMTwvZGl2Pjxicj48YnI+
PHNwYW4gdGl0bGU9Im5ldGVhc2Vmb290ZXIiPjxwPiZuYnNwOzwvcD48L3NwYW4+
------=_Part_183730_484598546.1544772945613--
------=_Part_183728_654166346.1544772945612
Content-Type: text/plain; name="=?GBK?Q?=B2=E2=CA=D422.txt?="
Content-Transfer-Encoding: base64
Content-Disposition: attachment; filename="=?GBK?Q?=B2=E2=CA=D422.txt?="
vM6088/Dv9W85Lu5yse087/Nu6e94r72DQo=
------=_Part_183728_654166346.1544772945612--
这是一封邮件的原始数据,每一封邮件都有From,To,Date等字段,传输编码有两种,Base64和Quoted-printable。目前MIME邮件中的数据编码普遍采用Base64编码或Quoted-printable编码来实现。
Base64编码:
Base64编码的目的是将输入的数据全部转换成由64 个指定ASCII字符组成的字符序列, 这64个字符由{‘A’-‘Z’, ‘a’-‘z’, ‘0’-‘9’, ‘+’, ‘/’}构成。编码时将需要转换的数据每次取出6bit,然后将其转换成十进制数字,这个数字的范围最小为0,最大为63,然后查询{‘A’-‘Z’, ‘a’-‘z’, ‘0’-‘9’, ‘+’, ‘/’}构成的字典表,输出对应位置的ASCII码字符,这样每3个字节的数据内容会被转换成4个字典中的ASCII码字符,当转换到数据末尾不足3个字节时,则用“=”来填充。
Quoted-printable编码:
Quoted-printable编码的目的也是将输入的信息转换成可打印的ASCII码字符,但它是根据信息的内容来决定是否进行编码,如果读入的字节处于33-60、62-126范围内的,这些都是可直接打印的ASCII字符,则直接输出,如果不是,则将该字节分为两个4bit,每个用一个16进制数字来表示,然后在前面加“=”,这样每个需要编码的字节会被转换成三个字符来表示。
boundary="----=_Part_183728_654166346.1544772945612" 定义了一个分隔符(boundary)。正文中每个部分都以“--”加分隔符开头,整个多部分内容结束时,在分隔符后面再加上“--”。在一个分隔符的范围内还可以声明其他分隔符(即嵌套的 multipart)。结束标识必须与定义时的分隔符相对应,中间包含多少个部分(段)都没有关系。对照邮件格式图(或上面的原始数据)来看就清楚了:
POP3代码
def receive_email_pop3(self):
    """Download every message from the POP3 mailbox and parse each one.

    Opens an SSL connection to ``self.server_host``/``self.server_port``,
    authenticates with ``self.email_address`` and ``self.auth_code``,
    retrieves each message and passes its raw bytes to
    ``self.parse_emial``. Progress and failures are reported with
    ``print``; the method returns ``None``.

    Connection and login failures are reported and end the method early.
    Errors raised by ``stat()``/``list()`` propagate to the caller, but the
    connection is always shut down first.
    """
    try:
        email_server = poplib.POP3_SSL(host=self.server_host, port=self.server_port, timeout=10)
    except (OSError, poplib.error_proto):
        print("connect timeout...................")
        return
    print("connect server success............")

    try:
        try:
            email_server.user(self.email_address)
        except (OSError, poplib.error_proto):
            print("sorry the given email address seem does not exist")
            return
        try:
            email_server.pass_(self.auth_code)
        except (OSError, poplib.error_proto):
            # This step validates the password / authorization code,
            # so the message names that rather than the username.
            print("sorry the given password does not seem correct")
            return
        print("password correct,now will list email")

        emailMsgNum, emailSize = email_server.stat()
        print('email number is %d and size is %d' % (emailMsgNum, emailSize))
        email_count = len(email_server.list()[1])
        time.sleep(2)

        # Iterate over all messages; POP3 message numbers start at 1.
        for i in range(1, email_count + 1):
            try:
                contents = email_server.retr(i)[1]
            except (OSError, poplib.error_proto) as e:
                print(e)
                print('pop3 unknow error..............................')
                continue
            # retr() returns the message as a list of lines without line endings.
            email_content = b'\r\n'.join(contents)
            try:
                self.parse_emial(email_content)
            except Exception:
                # One malformed message must not stop the remaining downloads.
                print(email_content)
                print('decode email failed.....................')
    finally:
        # Close the connection: send QUIT when possible, otherwise just
        # drop the socket so it is never leaked.
        try:
            email_server.quit()
        except (OSError, poplib.error_proto):
            email_server.close()
IMAP代码
也可以用imapclient进行获取
def receive_email_imap(self):
    """Fetch every message from every IMAP mailbox and parse each one.

    Connects over SSL to ``self.server_host``/``self.server_port``, logs in
    with ``self.email_address`` and ``self.auth_code``, then walks the
    mailboxes returned by ``LIST``. Each mailbox is opened read-only and
    each message's raw bytes are passed to ``self.parse_emial``. Progress
    and failures are reported with ``print``; the method returns ``None``.

    A login failure propagates to the caller; the session is always logged
    out first.
    """
    # Modified UTF-7 name of a folder that is skipped rather than selected.
    skipped_box = '"&UXZO1mWHTvZZOQ-"'

    server = imaplib.IMAP4_SSL(port=self.server_port, host=self.server_host)
    try:
        res = server.login(self.email_address, self.auth_code)
        print(res)

        mail_boxes = []
        for entry in server.list()[1]:
            if not isinstance(entry, bytes):
                # LIST may yield None (no mailboxes) or a tuple (literal name).
                continue
            # NOTE(review): assumes the hierarchy delimiter is "/", i.e. each
            # line looks like: (flags) "/" "name" - confirm for other servers.
            parts = entry.decode('utf-8').rsplit('"/"')
            if len(parts) > 1:
                mail_boxes.append(parts[1].lstrip())

        total = 0
        for box in mail_boxes:
            if box == skipped_box:
                continue
            try:
                if server.select(box, readonly=True)[0] != 'OK':
                    # The mailbox cannot be selected; SEARCH would be illegal.
                    continue
                # Other search keys: Recent Seen Answered Flagged Deleted Draft
                data = server.search(None, 'ALL')[1]
            except (OSError, imaplib.IMAP4.error) as e:
                print(e)
                print('imap unknow error..............................')
                continue

            message_numbers = (data[0] or b'').split()
            email_count = len(message_numbers)
            total += email_count
            print('email count is {}'.format(email_count))

            for message_number in message_numbers:
                try:
                    email_data = server.fetch(message_number, '(RFC822)')[1]
                    raw_email = email_data[0][1]
                except Exception as ex:
                    # Without a fetched body there is nothing to parse.
                    print(ex)
                    print('imap fetch unknow error.............')
                    continue
                try:
                    self.parse_emial(raw_email)
                except Exception as ec:
                    # One malformed message must not stop the remaining ones.
                    print(ec)
                    print('decode email failed.....................')

        print('total email count is {}'.format(total))
    finally:
        server.logout()
邮件解析
本次解析采用了 eml_parser 模块,也可以采用标准库的 email 模块进行解析,但是效果不是特别好,有很多内容没法解析。
def parse_emial(self, msg):
    """Parse one raw message with eml_parser and save its attachments.

    Args:
        msg: Raw message bytes, as passed in by the POP3/IMAP receivers.

    Returns:
        A dict with the parsed fields: ``size`` (length of ``msg`` in
        bytes), ``from``, ``to``, ``cc``, ``subject``, ``body``,
        ``attachments`` (values returned by ``save_attachment_file``) and
        ``attachment_size`` (sum of the attachment sizes).
    """
    eml = eml_parser.eml_parser.decode_email_b(
        msg, include_raw_body=True, include_attachment_data=True)
    raw_size = len(msg)
    header = eml.get('header') or {}
    body = eml.get('body')  # type: list
    attachments = eml.get('attachment')

    # Blind-copy (bcc) recipients are not necessarily available.
    # Prefer the recipient list from the nested raw header mapping and fall
    # back to the top-level 'to' entry.
    try:
        to = header['header']['to']
    except (KeyError, TypeError):
        to = header.get('to', [])

    from_ = header.get('from')
    subject = header.get('subject')
    # Carbon-copy recipients.
    cc = header.get('cc')

    # Directory in which the attachments are saved.
    res_dir = self.frame.get_task_res_path()
    file_list, attachment_size = self.parse_attachment(attachments, res_dir)

    return {
        'size': raw_size,
        'from': from_,
        'to': to,
        'cc': cc,
        'subject': subject,
        'body': body,
        'attachments': file_list,
        'attachment_size': attachment_size,
    }
# localtimestamp=0
# try:
# localtimestamp=int(time.mktime(header['date'].timetuple()))
# except:
# print('date format error......{}'.format(header['date']))
# else:
# mail_info.ReceiveDate=localtimestamp
# def get_eml_content(self,mail_content):
# """分析content字段"""
# if mail_content is None:
# return
# if mail_content is not None:
# if len(mail_content) > 14 and 'html' in str(mail_content[:14]).lower():
# mail_info.Html=mail_content
# else:
# mail_info.Text=mail_content.replace('\r\n', ' ')
def parse_attachment(self, attachments, save_path):
    """Save every attachment and report what was saved.

    Args:
        attachments: Iterable of attachment dicts with the keys
            ``filename``, ``size`` and ``raw``, or ``None``.
        save_path: Directory handed to ``save_attachment_file``.

    Returns:
        A ``(file_list, size)`` tuple: the values returned by
        ``save_attachment_file`` for each attachment, and the sum of the
        ``size`` entries. ``([], 0)`` when there are no attachments.
    """
    file_list = []
    size = 0
    if not attachments:
        return file_list, size
    for attachment in attachments:
        # A missing or None size counts as 0 instead of raising TypeError.
        size += attachment.get('size') or 0
        saved = self.save_attachment_file(
            attachment.get('raw'), save_path, attachment.get('filename'))
        # Keep the list of all saved files.
        file_list.append(saved)
    return file_list, size
def save_attachment_file(self, raw, save_path, file_name):
    """Base64-decode one attachment and write it into ``save_path``.

    Args:
        raw: Base64-encoded attachment content.
        save_path: Existing directory to write into.
        file_name: Attachment name taken from the message; only its final
            path component is used.

    Returns:
        The path the attachment was (or would have been) written to. A
        decode or write failure is reported with ``print`` and the path is
        still returned.
    """
    # The name comes from the message itself, so drop any directory part
    # (including Windows-style separators): an absolute or "../" name must
    # not be able to write outside save_path.
    safe_name = os.path.basename((file_name or '').replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        safe_name = 'attachment'
    # Build the path before the try block so it is always defined on return.
    file_path = os.path.join(save_path, safe_name)
    try:
        data = base64.b64decode(raw)
        with open(file_path, 'wb') as fp:
            fp.write(data)
    except (TypeError, ValueError, OSError) as e:
        # binascii.Error (invalid base64) is a ValueError subclass.
        print('解码出错,{}'.format(e))
    return file_path
邮箱文件夹
这里的 “&UXZO1mWHTvZZOQ-” 是用 IMAP 的修改版 UTF-7(modified UTF-7)编码的文件夹名,解码后为“其他文件夹”,可以用 imapclient 模块进行解码。
注意事项
-授权码需要自己配置,一般第三方登录都是采用授权码。尤其是163,为了推广邮箱大师,授权码登录也会被拦截,还需要另外设置,网上有个链接地址:http://config.mail.163.com/settings/imap/aaa@qq.com,设置之后就不会再收到拦截邮件了。但奇怪的是Foxmail不会被拦截(暂时不太明白为什么会这样,有知道的小伙伴可以留言告知下)。