欢迎您访问程序员文章站,本站旨在为大家提供并分享程序员计算机编程知识!
您现在的位置是: 首页

python爬虫爬取海量病毒文件

程序员文章站 2022-05-04 14:08:34
...

因为工作需要,要用深度学习识别恶意二进制文件,所以写了个爬虫爬取一些样本资源。

# -*- coding: utf-8 -*-
import requests
import re
import sys
import logging

# Python 2 only: force the implicit str/unicode conversion codec to UTF-8.
# reload(sys) is needed because sys.setdefaultencoding is removed from the
# sys namespace at interpreter start-up. Python 3 has no reload builtin
# and its default encoding is already UTF-8, so the hack is skipped there
# instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')

# Script-wide logger: INFO and above go to a log file, one line per record
# carrying timestamp, logger name, level and message.
logger = logging.getLogger("rrjia")
logger.setLevel("INFO")

file_handler = logging.FileHandler("/home/rrjia/Python/test.log")
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)


# Site that lists the samples. Its index page links each sample as
#   <a href="./files.php?file=25e8bf41343bda75a9170aad44094647">
# and further listing pages are addressed as /index.php?page=N.
BASE_URL = 'http://malwaredb.malekal.com'
DOWNLOAD_DIR = "/home/rrjia/TestData/"
REQUEST_TIMEOUT = 120  # seconds, applied to every HTTP request

# Raw string so the regex escapes are not treated as (invalid) string escapes;
# the capture group yields the sample id directly.
FILE_LINK_RE = re.compile(r'<a href="\./files\.php\?file=(\w+)">')


def extract_file_ids(html):
    """Return the sample ids linked from a listing page, in document order.

    :param html: text of a listing page.
    :returns: list of the ``file=`` query values found in
        ``<a href="./files.php?file=...">`` anchors; empty if none match.
    """
    return FILE_LINK_RE.findall(html)


def download_sample(file_id, page, index):
    """Fetch one sample and store it as ``<DOWNLOAD_DIR><file_id>.zip``.

    :param file_id: sample id as returned by :func:`extract_file_ids`.
    :param page: listing page number, used only in the log message.
    :param index: running file counter, used only in the log message.
    :returns: True if the file was downloaded and written, False otherwise.
    """
    url = BASE_URL + "/files.php?file=" + file_id
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Without this an HTTP error page would be saved as if it were a sample.
        response.raise_for_status()
        with open(DOWNLOAD_DIR + file_id + ".zip", "wb") as archive:
            archive.write(response.content)
    except Exception:
        # Best effort: one bad sample must not abort the whole crawl, but the
        # URL and traceback are recorded so the failure can be diagnosed.
        logger.exception("this url error download failed: %s", url)
        return False
    logger.info("success download %d page %d file %s.zip", page, index, file_id)
    return True


if __name__ == '__main__':
    count = 1
    error_count = 0

    # A failure to fetch the index is fatal: there is nothing to download.
    begin_html = requests.get(BASE_URL, timeout=REQUEST_TIMEOUT)
    begin_html.raise_for_status()

    file_ids = extract_file_ids(begin_html.text)
    logger.info("%d page contains %d virus file", 0, len(file_ids))
    for file_id in file_ids:
        if not download_sample(file_id, 0, count):
            error_count += 1
        # The counter numbers every attempted file, successful or not.
        count += 1
    logger.info("finished: %d file attempted, %d failed", count - 1, error_count)


#    for page in range(1, 828):
#        url = "http://malwaredb.malekal.com/index.php?page=" + str(page)
#        html = requests.get(url)
#        img_src = re.findall('<a href="\./files\.php\?file=\w+">', html.text, re.S)
#        imgUrl = []
#        for each_src in img_src:
#            arr = each_src.split("=")
#            imgUrl.append("http://malwaredb.malekal.com/files.php?file=" + arr[2].replace('"', "").replace(">", ""))
#        logger.info("%d page contains %d virus file" % (page, len(imgUrl)))
#        for each in imgUrl:
#            try:
#                imgContext = requests.get(each, timeout=120).content
#                fileName = each.split("=")[1]
#                with open("/home/rrjia/TestData/" + str(fileName) + ".zip", "wb+") as code:
#                    code.write(imgContext)
#                    code.close()
#                logger.info("success download %d page %d file " % (page, count) + str(fileName) + ".zip")
#            except Exception as e:
#                error_count += 1
#                logger.info("this url error")
#            count += 1