python爬取cnvd漏洞库信息的实例

程序员文章站 2023-12-01 23:12:46

今天一同事需要整理http://ics.cnvd.org.cn/工控漏洞库里面的信息，一看960多个要整理到什么时候才结束。所以我决定写个爬虫帮他抓取数据。看了一下...

今天一同事需要整理http://ics.cnvd.org.cn/工控漏洞库里面的信息，一看960多个要整理到什么时候才结束。

所以我决定写个爬虫帮他抓取数据。

看了一下各类信息还是很规则的，感觉应该很好写。

但是这个网站设置了各种反爬虫手段。

经过多方搜索查阅，问题最终还是解决了。

设计思路：

1.先抓取每一个漏洞信息对应的网页url

2.获取每个页面的漏洞信息（注意：第一个脚本保存的文件名为“时间戳+url.xls”，而第二个脚本读取的是 url.xlsx，运行第二步之前需要先把第一步生成的文件重命名或另存为 url.xlsx）

# -*- coding: utf-8 -*-
# Stage 1: walk the paginated listing on ics.cnvd.org.cn, pull out every link
# that points at a "flaw/show" detail page, and store one URL per row in the
# first column of an .xls workbook.
import re
import time

import requests
import xlwt
from bs4 import BeautifulSoup

# Browser-like request headers sent with every listing request.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, sdch',
    'accept-language': 'zh-cn,zh;q=0.8',
    'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/54.0.2840.71 safari/537.36'
}
# NOTE(review): presumably a cookie copied from a real browser session;
# it may need to be refreshed before running -- confirm.
cookies = {'__jsluid': '8d3f4c75f437ca82cdfad85c0f4f7c25'}

myfile = xlwt.Workbook()
wtable = myfile.add_sheet(u"信息", cell_overwrite_ok=True)

j = 0    # next free row in the sheet
a = 900  # listing offset of the page being fetched (20 entries per page)
for i in range(4):
    url = "http://ics.cnvd.org.cn/?max=20&offset=" + str(a)
    r = requests.get(url, headers=headers, cookies=cookies)
    print(r.status_code)
    # Keep retrying until the server answers 200; pause between attempts so
    # the retries do not hammer the site in a tight loop.
    while r.status_code != 200:
        time.sleep(1)
        r = requests.get(url, headers=headers, cookies=cookies)
        print(r.status_code)
    html = r.text
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    # Only anchors whose href points at a vulnerability detail page are kept.
    detail_links = soup.find('tbody', id='tr').find_all(
        'a', href=re.compile('http://www.cnvd.org.cn/flaw/show'))
    for tag in detail_links:
        print(tag.attrs['href'])
        wtable.write(j, 0, tag.attrs['href'])
        j += 1
    a += 20
    print(u"已完成%s" % (a))

# Timestamped file name, e.g. 20231201231246url.xls.
filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "url.xls"
myfile.save(filename)
print(u"完成%s的url备份" % time.strftime('%Y%m%d%H%M%S', time.localtime()))

# -*- coding: utf-8 -*-
# Stage 2: open every URL listed in the first column of url.xlsx with a
# Selenium-driven Firefox, read the page title and all table cells, and write
# them as one row per page into a new .xls workbook.
from selenium import webdriver
import xlrd
import xlwt
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest
import time
import re


def _timestamp():
    # Current local time as YYYYmmddHHMMSS, used in file names and messages.
    return time.strftime('%Y%m%d%H%M%S', time.localtime())


class gk(unittest.TestCase):
    """Scrape the vulnerability detail pages listed in ``url.xlsx``."""

    def setUp(self):
        """Start Firefox with a 5 second implicit wait."""
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(5)
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_gk(self):
        """Visit each URL and store title plus table cells row by row.

        A failure on one page does not stop the run: the data collected so
        far is saved to a timestamped backup and the loop moves on.
        """
        myfile = xlwt.Workbook()
        wtable = myfile.add_sheet(u"info", cell_overwrite_ok=True)
        # NOTE(review): stage 1 saves "<timestamp>url.xls"; this expects the
        # file to have been renamed/converted to url.xlsx -- confirm.
        data = xlrd.open_workbook('url.xlsx')
        table = data.sheets()[0]
        nrows = table.nrows
        driver = self.driver

        j = 0  # output row; only advanced after a page was fully written
        for i in range(nrows):
            try:
                s = []
                driver.get(table.cell(i, 0).value)
                title = driver.find_element(By.XPATH, "//h1").text
                print(title)
                s.append(title)
                tbody = driver.find_element(By.XPATH, "//tbody")
                for tr in tbody.find_elements(By.TAG_NAME, 'tr'):
                    for cell in tr.find_elements(By.TAG_NAME, "td"):
                        print(cell.text)
                        s.append(cell.text)
                for k, info in enumerate(s):
                    wtable.write(j, k, info)
                j += 1
            except Exception:
                # Best effort: keep what has been gathered and continue with
                # the next URL. Exception (not a bare except) lets Ctrl-C
                # still interrupt the run.
                filename = str(_timestamp()) + "url.xls"
                myfile.save(filename)
                print(u"异常自动保存%s的漏洞信息备份" % _timestamp())

        filename = str(_timestamp()) + "url.xls"
        myfile.save(filename)
        print(u"完成%s的漏洞信息备份" % _timestamp())

    def is_element_present(self, how, what):
        """Return True if an element located by (how, what) exists."""
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True

    def is_alert_present(self):
        """Return True if switching to an alert succeeds."""
        try:
            self.driver.switch_to.alert
        except NoAlertPresentException:
            return False
        return True

    def close_alert_and_get_its_text(self):
        """Accept or dismiss the current alert and return its text.

        The alert is accepted when ``self.accept_next_alert`` is true and
        dismissed otherwise; the flag is reset to True afterwards.
        """
        try:
            alert = self.driver.switch_to.alert
            alert_text = alert.text
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert_text
        finally:
            self.accept_next_alert = True

    def tearDown(self):
        """Close the browser and check no verification errors were recorded."""
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)


if __name__ == "__main__":
    unittest.main()

好了。看看结果怎样！

python爬取cnvd漏洞库信息的实例

ok！剩下手动整理一下，收工！

以上这篇python爬取cnvd漏洞库信息的实例就是小编分享给大家的全部内容了，希望能给大家一个参考，也希望大家多多支持。

上一篇： PHP和MySql中32位和64位的整形范围是多少

下一篇： python pexpect ssh 远程登录服务器的方法

python爬取cnvd漏洞库信息的实例

python爬取cnvd漏洞库信息的实例

Python 爬取携程所有机票的实例代码

Python爬取豆瓣电影信息遇到的问题

python爬取本站电子书信息并入库的实现代码

Python爬取数据并写入MySQL数据库的实例

python requests爬取高德地图数据的实例

几行Python代码爬取3000+上市公司的信息

Python爬虫使用selenium爬取qq群的成员信息（全自动实现自动登陆）

Python爬取租房数据实例，据说可以入门爬虫的小案例！

Python爬虫实例：爬取B站《工作细胞》短评——异步加载信息的爬取