因为工作需要,要用深度学习识别恶意二进制文件,所以爬取一些资源。
# -*- coding: utf-8 -*-
import requests
import re
import sys
import logging
# Python 2-only idiom: switch the interpreter's default string encoding to
# UTF-8. Python 3 has neither a builtin reload() nor sys.setdefaultencoding(),
# so running these two lines there would raise; skip them on Python 3.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Script-wide logger named "rrjia". Records at INFO and above are appended to
# a fixed log file, formatted as "<time> - <logger name> - <level> - <message>".
# The log file's directory must already exist, otherwise FileHandler raises.
logger = logging.getLogger("rrjia")
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
file_handler = logging.FileHandler("/home/rrjia/Python/test.log")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.INFO)
# Root of the site being crawled; the front page doubles as index page 0.
BASE_URL = 'http://malwaredb.malekal.com'
# Directory the downloaded archives are written to (must already exist).
DOWNLOAD_DIR = "/home/rrjia/TestData/"
# Timeout, in seconds, applied to every HTTP request.
REQUEST_TIMEOUT = 120
# Matches index-page links such as
#   <a href="./files.php?file=25e8bf41343bda75a9170aad44094647">
# and captures the value of the "file" query parameter.
FILE_LINK_RE = re.compile(r'<a href="\./files\.php\?file=(\w+)">')


def extract_file_ids(html):
    """Return the ids of all sample files linked from an index page.

    Args:
        html: Text of an index page.

    Returns:
        A list with the value of the ``file`` query parameter of every
        ``./files.php?file=...`` link, in document order (empty if none).
    """
    return FILE_LINK_RE.findall(html)


def build_file_url(file_id):
    """Return the absolute download URL for the sample with id *file_id*."""
    return BASE_URL + "/files.php?file=" + file_id


def build_page_url(page):
    """Return the URL of index page *page*; page 0 is the site's front page."""
    if page == 0:
        return BASE_URL
    return BASE_URL + "/index.php?page=" + str(page)


def download_page(page, count=1):
    """Download every sample linked from one index page.

    Each sample is saved as ``DOWNLOAD_DIR + <file id> + ".zip"``. A failure
    on a single sample (network error, HTTP error status, or a failed write)
    is logged and counted, and the crawl moves on to the next sample.

    Args:
        page: Index page number; 0 is the front page.
        count: Running 1-based number of the next sample, used in log lines.

    Returns:
        A ``(next_count, errors)`` tuple: the running sample number after this
        page, and how many samples of this page failed.

    Raises:
        requests.RequestException: If the index page itself cannot be fetched
            or is answered with an HTTP error status.
    """
    index = requests.get(build_page_url(page), timeout=REQUEST_TIMEOUT)
    index.raise_for_status()
    file_ids = extract_file_ids(index.text)
    logger.info("%d page contains %d virus file", page, len(file_ids))

    errors = 0
    for file_id in file_ids:
        url = build_file_url(file_id)
        try:
            reply = requests.get(url, timeout=REQUEST_TIMEOUT)
            # Without this check an HTML error page would be saved as a .zip.
            reply.raise_for_status()
            with open(DOWNLOAD_DIR + file_id + ".zip", "wb") as archive:
                archive.write(reply.content)
        except Exception:
            # Deliberately broad: the crawl is best-effort, so one bad sample
            # must not stop the rest. The URL and traceback go to the log.
            errors += 1
            logger.exception("this url error download failed: %s", url)
        else:
            logger.info("success download %d page %d file %s.zip",
                        page, count, file_id)
        # NOTE(review): the original indentation was lost; the counter is
        # assumed to advance once per attempted sample, success or failure.
        count += 1
    return count, errors


def main(last_page=0):
    """Crawl index pages 0..last_page and download every linked sample.

    With the default ``last_page=0`` only the front page is crawled. The
    paginated listing lives at ``index.php?page=N``; the previously
    commented-out loop covered pages 1 through 827, i.e. ``main(827)``.

    Args:
        last_page: Highest index page number to crawl (inclusive).
    """
    count = 1
    error_count = 0
    for page in range(last_page + 1):
        count, errors = download_page(page, count)
        error_count += errors
    logger.info("finished: %d file(s) attempted, %d failed",
                count - 1, error_count)


if __name__ == '__main__':
    main()