基于 Python 爬虫的 GitHub exploitdb 漏洞库监控与下载
offensive.py(爬取项目历史更新内容)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import time
import urllib.request
import conf as cf
# Module-level configuration: the releases page to crawl and the regexes
# used to pull download links and pagination URLs out of the raw HTML.
base_url = 'https://github.com/offensive-security/exploitdb/releases'
# Matches the relative path of each ".zip" release asset.
download_link_pattern = 'href="(.*?)zip" rel="nofollow">'
# Matches the "next" pagination link on the first releases page.
first_pattern = r'</span><a rel="nofollow" href="(.*?)">next.*'
# Matches the "next" pagination link on subsequent pages.
page_pattern = r'>previous</a><a rel="nofollow" href="(.*?)">next.*'


class mycrawler:
    """Crawl the exploitdb GitHub releases pages and append every
    release .zip download URL found to result.txt.

    Fixes applied to the original: urllib.request.Request / HTTPError /
    re.M / re.I / True were case-mangled (all would raise AttributeError
    or NameError), the result file was not closed on error, a missing
    pagination link caused AttributeError on ``next_page.group``, and
    ``run()`` called ``fetch_download_link()`` without its required URL
    argument.
    """

    def __init__(self, base_url=base_url, start_page="first 1 page"):
        self.base_url = base_url      # URL of the page currently being crawled
        self.start_page = start_page  # human-readable label of the current page

    def first_page(self):
        """Crawl the first releases page, then continue via fetch_next_page()."""
        try:
            req = urllib.request.Request(self.base_url)
            html = urllib.request.urlopen(req)
            doc = html.read().decode('utf8', 'ignore')
            next_page = re.search(first_pattern, doc, re.M | re.I)
            print('now working on page = {}\n'.format(self.start_page))
            time.sleep(5)  # throttle requests to be polite to GitHub
            self.fetch_download_link(self.base_url)
            if next_page is None:
                return  # no pagination link found: nothing more to crawl
            self.start_page = next_page.group(1)
            self.base_url = next_page.group(1)
        except urllib.error.HTTPError as err:
            print(err.msg)
        self.fetch_next_page()

    def fetch_next_page(self):
        """Follow "next" pagination links until none remain or an HTTP error occurs."""
        while True:
            try:
                req = urllib.request.Request(self.base_url)
                html = urllib.request.urlopen(req)
                doc = html.read().decode('utf8', 'ignore')
                next_page = re.search(page_pattern, doc, re.M | re.I)
                print('now working on page {}\n'.format(self.start_page))
                time.sleep(5)  # wait 5 seconds between page fetches
                self.fetch_download_link(self.base_url)
                if next_page is None:
                    break  # reached the last page
                self.start_page = next_page.group(1)
                self.base_url = next_page.group(1)
            except urllib.error.HTTPError as err:
                print(err.msg)
                break

    def fetch_download_link(self, aurl):
        """Extract all .zip download links from *aurl* and append them to result.txt."""
        req = urllib.request.Request(aurl)
        html = urllib.request.urlopen(req)
        doc = html.read().decode('utf8')
        alist = list(set(re.findall(download_link_pattern, doc)))
        # Context manager guarantees the file is closed even if a write fails.
        with open('result.txt', 'a') as f:
            for item in alist:
                url = "https://github.com/" + item + "zip"
                print('storing {}'.format(url))
                f.write(url + '\n')
        time.sleep(7)  # throttle before the caller fetches the next page

    def run(self):
        """Entry point: start the full crawl from the first page.

        The original called ``self.fetch_download_link()`` with no URL,
        which is a guaranteed TypeError; the intended behavior is to
        kick off the crawl.
        """
        self.first_page()
if __name__ == '__main__':
    # Start a full crawl of the releases pages when run as a script.
    mc = mycrawler()
    mc.first_page()
text.py(监控首页更新,并爬取)
#!/usr/bin/env python
# -*- coding:utf-8 -*
from selenium import webdriver
import re
import time
import urllib.request
import conf as cf
# Releases page of the exploitdb mirror that is being monitored.
base_url = 'https://github.com/offensive-security/exploitdb/releases'
# Matches the relative path of each ".zip" release asset in the page HTML.
download_link_pattern = 'href="(.*?)zip" rel="nofollow">'
# Matches the "next" pagination link on the first releases page; used as
# a change marker to detect new releases.
first_pattern = r'</span><a rel="nofollow" href="(.*?)">next.*'
# 监控项目首页更新
def jiankong_page():
    """Monitor the releases front page for updates.

    Takes an initial snapshot of the download links and the pagination
    link, then re-checks every 5 hours. When the pagination link changes
    (meaning a new release pushed older entries onto the next page), the
    newest download URL is appended to result.txt.

    Fixes applied to the original: ``re.rearch`` (AttributeError),
    case-mangled ``Request`` / ``True`` / ``re.M`` / ``re.I`` /
    ``HTTPError``, an unclosed file handle, and the "star monitoring"
    message typo.
    """
    print("start monitoring ")
    req = urllib.request.Request(base_url)
    html = urllib.request.urlopen(req)
    doc = html.read().decode('utf8', 'ignore')
    next_page = re.search(first_pattern, doc, re.M | re.I)
    flag_page = next_page.group(1)  # pagination link used as the change marker
    flag_list = []
    # Initial snapshot of the download URLs on the front page.
    alist = list(set(re.findall(download_link_pattern, doc)))
    for item in alist:
        url = "https://github.com/" + item + "zip"
        flag_list.append(url)
    # Poll on a fixed schedule (once every 5 hours).
    while True:
        try:
            time.sleep(5 * 60 * 60)
            req = urllib.request.Request(base_url)
            html = urllib.request.urlopen(req)
            doc = html.read().decode('utf8', 'ignore')
            next_page = re.search(first_pattern, doc, re.M | re.I)
            # A changed pagination link means a new release was published.
            if next_page.group(1) != flag_page:
                print("have update")
                # First match is the most recently published release.
                item = re.search(download_link_pattern, doc, re.M | re.I)
                new_url = "https://github.com/" + item.group(1) + "zip"
                print("new url = " + new_url)
                flag_list.append(new_url)
                with open('result.txt', 'a') as f:
                    f.write(new_url + '\n')
                flag_page = next_page.group(1)
            else:
                print("no update")
        except urllib.error.HTTPError as err:
            print(err.msg)
            break
if __name__ == '__main__':
    # Run the monitoring loop when executed as a script.
    jiankong_page()
介绍一下我自己吧,我是fisher,互联网安全作者一枚,日常是分享有趣的安全技术与故事,当然也会记录学习之路的收获。对安全领域感兴趣,可以关注我的个人微信公众号:austfish。不想走丢的话,请关注【fisher的安全日记】!(别忘了加星标哦)or 个人博客: