下载糗事百科的内容_python版
程序员文章站
2022-05-12 19:42:59
复制代码 代码如下:#coding:utf-8 import urllib.request import xml.dom.minidom import sqlite3 im...
复制代码 代码如下:
#coding:utf-8
import urllib.request
import xml.dom.minidom
import sqlite3
import threading
import time
class logger(object):
    """Minimal console logger that prints each message on its own line."""

    def log(self, *msg):
        """Print every positional argument on a separate line.

        Args:
            *msg: Any number of objects; each one is passed to ``print``
                unchanged, in the order given.
        """
        for item in msg:
            print(item)
# Module-wide logger instance; the classes and functions in this file call
# log.log(...) on it.
log = logger()
# Prints a short test message as soon as the module is loaded.
log.log('测试下')
class downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the address to fetch.

        Args:
            url: URL that ``download`` hands to ``urllib.request.urlopen``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return the response body.

        Returns:
            The response body as returned by ``read()``, or ``None`` when
            the request fails (the failure is logged instead of raised).
        """
        log.log('开始下载', self.url)
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as exc:
            # Best-effort download: callers treat None as "skip this page".
            # Exception (not a bare except) keeps Ctrl-C working.
            log.log('下载出错', exc)
            return None
        log.log('下载完毕')
        return content
class parser(object):
    """Extract the post text and the linked post ids from one page."""

    def __init__(self, content):
        """Parse *content* into a DOM tree stored in ``self.html``.

        Args:
            content: Page markup handed to ``xml.dom.minidom.parseString``;
                it therefore has to be well-formed XML.

        Raises:
            xml.parsers.expat.ExpatError: If *content* is not well-formed.
        """
        # Root document node.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the post body and the ids of the linked posts.

        Returns:
            A dict with two keys:

            * ``'content'`` -- ``data`` of the first child of the last
              ``<div class="content">`` in the document, or ``''`` when no
              such div (or no such child) exists.
            * ``'url'`` -- list of ids cut out of the ``href`` of every
              ``<a>`` element that is the direct parent of a ``<span>``.
        """
        log.log('开始提取数据')
        contents = {'content': '', 'url': []}

        # Post text: look at every <div> and keep the ones with
        # class="content". getAttribute returns '' for a missing attribute,
        # so no separate hasAttribute check is needed.
        for div in self.html.getElementsByTagName('div'):
            if div.getAttribute('class') != 'content':
                continue
            first_child = div.firstChild
            # Skip empty divs and children that carry no character data
            # instead of raising IndexError / AttributeError.
            text = getattr(first_child, 'data', None)
            if text is not None:
                contents['content'] = text

        # Previous / next post links: every <span> whose parent is an <a>.
        for span in self.html.getElementsByTagName('span'):
            anchor = span.parentNode
            # Only element nodes have tagName; a <span> at the document root
            # has a non-element parent.
            if getattr(anchor, 'tagName', None) != 'a':
                continue
            href = anchor.getAttribute('href')
            # Drop the first 10 and the last 4 characters, which turns an
            # href of the form '/articles/<id>.htm' into '<id>'.
            contents['url'].append(href[10:][:-4])

        log.log('提取数据完毕')
        return contents
def downloadpage(qid, db):
    """Download one post, store its text and register the posts it links to.

    Args:
        qid: Id of the post; it is interpolated into the article URL.
        db: Storage object providing ``updatecontent(qid, text)``,
            ``addqid(qid)`` and ``updatestatus(qid, flag)``.
    """
    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = downloader(url).download()
    if not content:
        # Download failed or the body was empty: leave the database untouched.
        return

    contents = parser(content).parse()
    if contents['content']:
        db.updatecontent(qid, contents['content'])
    for linked_qid in contents['url']:
        db.addqid(linked_qid)
    # The post is flagged with status 2 only when exactly two linked ids
    # were extracted from the page.
    if len(contents['url']) == 2:
        db.updatestatus(qid, 2)
# Download pool: caps how many links may be downloaded at the same time.
class downloaderpool(object):
    """Fixed-size pool of worker threads that download queued post ids.

    Attributes:
        downloaders: One slot per worker; ``None`` marks a free slot,
            otherwise the slot holds the ``threading.Thread`` occupying it.
        downloadlist: Pending post ids, consumed from the front.
        db: Storage object passed to every ``downloadpage`` call.
    """

    def __init__(self, maxlength=15):
        """Create an idle pool.

        Args:
            maxlength: Number of worker slots, i.e. the maximum number of
                downloads started by one ``daemon`` pass.
        """
        self.downloaders = [None] * maxlength
        self.downloadlist = []
        self.db = None

    def setdownloadlist(self, downloadlist):
        """Merge *downloadlist* into the pending queue without duplicates.

        The merge goes through a ``set``, so the resulting order is
        unspecified.

        Args:
            downloadlist: List of hashable post ids to enqueue.
        """
        self.downloadlist = list(set(self.downloadlist + downloadlist))

    def setdb(self, db):
        """Set the storage object handed to the download threads."""
        self.db = db

    def daemon(self):
        """Run one scheduling pass over the pool.

        Frees the slots of finished threads, starts one download thread per
        free slot while ids are pending, waits for the threads started in
        this pass, then sleeps for one second.
        """
        log.log('设置守护进程')

        # Release every slot whose thread is no longer alive.
        for index, worker in enumerate(self.downloaders):
            if worker is not None and not worker.is_alive():
                log.log('将下载器置空', index)
                self.downloaders[index] = None

        # Fill the free slots with new download threads.
        started = []
        for index, worker in enumerate(self.downloaders):
            if worker is not None:
                continue
            qid = self.getqid()
            if qid:
                thread = threading.Thread(target=downloadpage,
                                          args=(qid, self.db))
                self.downloaders[index] = thread
                thread.start()
                started.append(thread)
                log.log('设置下载器', index)

        # Join only after every thread has been started: joining right after
        # start() would run the downloads one at a time. The pass still does
        # not return before its own downloads are done.
        for thread in started:
            thread.join()

        # Pause for one second before handing control back.
        time.sleep(1)

    def getqid(self):
        """Remove and return the first pending id.

        Returns:
            The id at the front of ``downloadlist``, or ``None`` when the
            queue is empty.
        """
        if not self.downloadlist:
            return None
        return self.downloadlist.pop(0)

    def begindownload(self):
        """Run one ``daemon`` pass in a daemon thread and wait for it."""
        supervisor = threading.Thread(target=self.daemon)
        supervisor.daemon = True
        supervisor.start()
        supervisor.join()

    def getdownloader(self):
        """Return the index of the first free slot, or ``None`` if all are taken."""
        for index, worker in enumerate(self.downloaders):
            if not worker:
                return index
        return None
# Parameterized SQL statements for the qiushibaike table.
# Insert a new row; parameters: (id, success).
add_q_id = 'insert into qiushibaike(id,success) values(?,?)'
# Store the text of a post; parameters: (content, id).
update_q_content = 'update qiushibaike set content=? where id=?'
# Change the download state of a post; parameters: (success, id).
update_q_status = 'update qiushibaike set success=? where id=?'
# List the ids of all posts in a given download state; parameters: (success,).
q_list = 'select id from qiushibaike where success=?'
# Count the rows carrying a given id; parameters: (id,).
q_list_by_id = 'select count(*) from qiushibaike where id=?'
class dbconnect(object):
    """SQLite-backed store for downloaded posts.

    Every public method opens its own connection to ``dbpath`` and closes it
    before returning. The statements expect a table shaped like::

        create table qiushibaike(
            id integer,
            content varchar,
            success integer
        )

    * ``id`` -- id of the post.
    * ``content`` -- text of the post.
    * ``success`` -- download state: ``1`` means not finished, ``2`` means
      finished. A post counts as finished once its text has been downloaded
      and the ids of the previous and next page have been obtained.
    """

    def __init__(self, dbpath='db.sqlite'):
        """Remember the database file to use.

        Args:
            dbpath: Path passed to ``sqlite3.connect`` on every call.
        """
        self.dbpath = dbpath

    def _execute(self, sql, params, fetch=False):
        """Run one statement on a fresh connection and always close it.

        Args:
            sql: Statement with ``?`` placeholders.
            params: Tuple of values bound to the placeholders.
            fetch: When true, return all result rows and do not commit;
                otherwise commit and return ``None``.
        """
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(sql, params)
                if fetch:
                    return c.fetchall()
                cn.commit()
                return None
            finally:
                c.close()
        finally:
            # Close the connection even when execute/commit raised.
            cn.close()

    def addqid(self, qid):
        """Insert a new post id with state ``1`` (not finished).

        Database errors are logged and swallowed so that one failing id
        does not abort the caller.

        Args:
            qid: Id of the post to register.
        """
        log.log('插入糗事百科', qid)
        try:
            self._execute(add_q_id, (qid, 1))
        except sqlite3.Error:
            log.log('添加id出错', qid)
        else:
            # Report success only when the insert really went through.
            log.log('插入成功')

    def updatecontent(self, qid, content):
        """Store *content* as the text of post *qid*.

        Raises:
            sqlite3.Error: If the statement fails.
        """
        log.log('更新糗事百科', qid, content)
        self._execute(update_q_content, (content, qid))
        log.log('更新成功')

    def updatestatus(self, qid, flag):
        """Set the download state of post *qid* to *flag*.

        Raises:
            sqlite3.Error: If the statement fails.
        """
        log.log('更新状态', qid, flag)
        self._execute(update_q_status, (flag, qid))
        log.log('更新状态成功')

    def getlist(self, undonloaded=1):
        """Return the ids of all posts whose state equals *undonloaded*.

        Args:
            undonloaded: State to filter by; the default ``1`` selects the
                posts that are not finished yet.

        Returns:
            A list with the first column (the id) of every matching row.

        Raises:
            sqlite3.Error: If the query fails.
        """
        log.log('获得列表')
        rows = self._execute(q_list, (undonloaded,), fetch=True)
        ids = [row[0] for row in rows]
        log.log('获得列表成功')
        return ids
class singledownloader(object):
    """Sequential downloader: fetches the queued post ids one after another.

    Attributes:
        downloadlist: Pending post ids.
        db: Storage object passed to every ``downloadpage`` call.
    """

    def __init__(self):
        """Create a downloader with an empty queue and no storage object."""
        self.downloadlist = []
        # Defined up front so begindownload never hits a missing attribute.
        self.db = None

    def setdb(self, db):
        """Set the storage object handed to ``downloadpage``."""
        self.db = db

    def setdownloadlist(self, downloadlist):
        """Merge *downloadlist* into the pending queue without duplicates.

        The merge goes through a ``set``, so the resulting order is
        unspecified.

        Args:
            downloadlist: List of hashable post ids to enqueue.
        """
        self.downloadlist = list(set(self.downloadlist + downloadlist))

    def begindownload(self):
        """Download every pending id, removing each one from the queue.

        Draining the queue keeps already processed ids from being merged
        back in and downloaded again on the next call.
        """
        while self.downloadlist:
            downloadpage(self.downloadlist.pop(0), self.db)
def main():
    """Download every post still marked as unfinished in ``db.sqlite``.

    Repeatedly asks the database for the ids in the unfinished state and
    feeds them to the downloader until the database reports none.
    """
    db = dbconnect('db.sqlite')
    # The sequential downloader is used here; downloaderpool offers the same
    # setdb / setdownloadlist / begindownload methods and can be swapped in.
    dp = singledownloader()
    dp.setdb(db)

    undownloadedlist = db.getlist()
    # Keep going for as long as unfinished posts remain.
    while undownloadedlist:
        # Feed the current batch to the downloader and process it.
        dp.setdownloadlist(undownloadedlist)
        dp.begindownload()
        time.sleep(1)
        # Refresh the list: processed pages may have registered new ids.
        undownloadedlist = db.getlist()
if __name__ == '__main__':
    # Start downloading only when executed as a script, not when imported.
    main()
代码是没问题的,可以正常运行,但是希望做到以下2方面:
1、多线程下载
2、代码分离度更高,更面向对象
上一篇: 从Windows系统的本地连接到Linux系统的腾讯云服务器的方法
下一篇: 博客营销之我见