python爬虫爬取海量病毒文件

程序员文章站 2022-05-04 14:08:34

...

因为工作需要，要用深度学习来识别恶意二进制文件，所以写了个爬虫爬取一些样本资源。

# -*- coding: utf-8 -*-
import requests
import re
import sys
import logging

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module has to be reloaded to get it back. Forcing UTF-8 makes implicit
# str <-> unicode conversions stop failing on non-ASCII text.
# Python 3 has no `reload` builtin and no setdefaultencoding (its default is
# already UTF-8), so the hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# File-backed logger shared by the whole script.
logger = logging.getLogger("rrjia")
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# FileHandler opens the file right away, so the directory must already exist.
file_handler = logging.FileHandler("/home/rrjia/Python/test.log")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.INFO)


# Site being scraped and where the downloaded samples are stored.
BASE_URL = 'http://malwaredb.malekal.com'
DOWNLOAD_DIR = "/home/rrjia/TestData/"
REQUEST_TIMEOUT = 120  # seconds

# Index pages (BASE_URL, or BASE_URL + "/index.php?page=N") link each sample as
# <td width="30px" align="center"><a href="./files.php?file=25e8bf41343bda75a9170aad44094647"><img src="img/tetedemort.gif" width="26px,height=26px"></a></td>
# The capture group grabs the identifier directly, so no string splitting
# or quote stripping is needed afterwards.
FILE_LINK_RE = re.compile(r'<a href="\./files\.php\?file=(\w+)">')


def extract_file_ids(html):
    """Return the sample identifiers linked from an index page.

    :param html: text of an index page.
    :returns: list of identifiers (the ``file=`` query values), in page
        order; empty when the page contains no sample links.
    """
    return FILE_LINK_RE.findall(html)


def download_sample(file_id):
    """Download one sample and store it as ``<DOWNLOAD_DIR><file_id>.zip``.

    :param file_id: identifier as returned by :func:`extract_file_ids`.
    :returns: path of the file that was written.
    :raises requests.RequestException: on network errors, timeouts, or a
        non-2xx response (so an HTML error page is never saved as a sample).
    :raises IOError: if the destination file cannot be written.
    """
    url = BASE_URL + "/files.php?file=" + file_id
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    path = DOWNLOAD_DIR + file_id + ".zip"
    # The with-statement closes the file; no explicit close() is required.
    with open(path, "wb") as out:
        out.write(response.content)
    return path


if __name__ == '__main__':
    error_count = 0

    begin_url = BASE_URL
    # A timeout keeps the script from hanging forever on a stalled server.
    begin_html = requests.get(begin_url, timeout=REQUEST_TIMEOUT)

    file_ids = extract_file_ids(begin_html.text)
    logger.info("0 page contains %d virus file", len(file_ids))

    # `count` is the 1-based position of the attempt, failed ones included.
    for count, file_id in enumerate(file_ids, 1):
        try:
            download_sample(file_id)
        except (requests.RequestException, IOError):
            error_count += 1
            # logger.exception records the traceback, so the cause is kept.
            logger.exception("this url error download failed: %s", file_id)
        else:
            logger.info("success download %d page %d file %s.zip", 0, count, file_id)

    logger.info("finished: %d downloaded, %d failed",
                len(file_ids) - error_count, error_count)


#    for page in range(1, 828):
#        url = "http://malwaredb.malekal.com/index.php?page=" + str(page)
#        html = requests.get(url)
#        img_src = re.findall('<a href="\./files\.php\?file=\w+">', html.text, re.S)
#        imgUrl = []
#        for each_src in img_src:
#            arr = each_src.split("=")
#            imgUrl.append("http://malwaredb.malekal.com/files.php?file=" + arr[2].replace('"', "").replace(">", ""))
#        logger.info("%d page contains %d virus file" % (page, len(imgUrl)))
#        for each in imgUrl:
#            try:
#                imgContext = requests.get(each, timeout=120).content
#                fileName = each.split("=")[1]
#                with open("/home/rrjia/TestData/" + str(fileName) + ".zip", "wb+") as code:
#                    code.write(imgContext)
#                    code.close()
#                logger.info("success download %d page %d file " % (page, count) + str(fileName) + ".zip")
#            except Exception as e:
#                error_count += 1
#                logger.info("this url error")
#            count += 1

上一篇： python读写音频文件小结

下一篇：敏捷开发的6个实战经验

python爬虫爬取海量病毒文件

Python实现爬取亚马逊数据并打印出Excel文件操作示例

详解用python写网络爬虫-爬取新浪微博评论

Python实现爬取知乎神回复简单爬虫代码分享

Python网络爬虫（selenium爬取动态网页、爬虫案例分析、哈希算法与RSA加密）

Python爬虫实战用 BeautifulSoup 爬取电影网站信息

python爬取网页转换为PDF文件

python爬取基于m3u8协议的ts文件并合并

python爬虫爬取奇书阁首页分类小说数据

python爬虫爬取豆瓣top排行图片

利用python爬虫爬取斗鱼图片(简单详细)