Example of a blog-post crawler in Python

The script below walks a blog's category navigation, pages through each category, and collects article titles and URLs.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# jcrawler
# author: jam <810441377@qq.com>
import time
import urllib2
from bs4 import BeautifulSoup
# Target site
targethost = "http://adirectory.blog.com"
# User agent
useragent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36'
# Link-extraction rules
# Rules for collecting category links
categoryfind = [{'findmode': 'find', 'findtag': 'div', 'rule': {'id': 'cat-nav'}},
                {'findmode': 'findall', 'findtag': 'a', 'rule': {}}]
# Rules for collecting article links
articlelistfind = [{'findmode': 'find', 'findtag': 'div', 'rule': {'id': 'content'}},
                   {'findmode': 'findall', 'findtag': 'h2', 'rule': {'class': 'title'}},
                   {'findmode': 'findall', 'findtag': 'a', 'rule': {}}]
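# How a rule chain works (see gethtmlfind below): each dict is one
# BeautifulSoup lookup; 'find' keeps the first matching tag, 'findall'
# keeps every match, and the HTML kept by one step becomes the input of
# the next. categoryfind therefore means: locate <div id="cat-nav">,
# then collect every <a> inside it.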
# Pagination URL pattern; '#page' is replaced with the page number
pageurl = 'page/#page/'
pagestart = 1
pagestep = 1
# Marker that signals we have run past the last page
pagestophtml = '404: page not found'
def gethtmltext(url):
    request = urllib2.Request(url)
    request.add_header('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp")
    request.add_header('Accept-Encoding', "*")
    request.add_header('User-Agent', useragent)
    return urllib2.urlopen(request).read()
def arrtostr(vararr):
    returnstr = ""
    for s in vararr:
        returnstr += str(s)
    return returnstr
def gethtmlfind(htmltext, findrule):
    findreturn = BeautifulSoup(htmltext, 'html.parser')
    returntext = ""
    for f in findrule:
        # From the second rule on, re-parse the HTML kept by the previous step
        if returntext != "":
            findreturn = BeautifulSoup(returntext, 'html.parser')
        if f['findmode'] == 'find':
            findreturn = findreturn.find(f['findtag'], f['rule'])
        if f['findmode'] == 'findall':
            findreturn = findreturn.find_all(f['findtag'], f['rule'])
        returntext = arrtostr(findreturn)
    return findreturn
def getcategory():
    categorys = []
    htmltext = gethtmltext(targethost)
    findreturn = gethtmlfind(htmltext, categoryfind)
    for tag in findreturn:
        print "[G]->category:" + tag.string + "|url:" + tag['href']
        categorys.append({'name': tag.string, 'url': tag['href']})
    return categorys
def getarticlelist(categoryurl):
    articles = []
    page = pagestart
    while True:
        htmltext = ""
        # Build the page path from the pattern; keep the global template intact
        pagepath = pageurl.replace("#page", str(page))
        print "[G]->pageurl:" + categoryurl + pagepath
        while True:
            try:
                htmltext = gethtmltext(categoryurl + pagepath)
                break
            except urllib2.HTTPError, e:
                print "[E]->HTTP Error:" + str(e.code)
                if e.code == 404:
                    # Past the last page: substitute the stop marker
                    htmltext = pagestophtml
                    break
                if e.code == 504:
                    print "[E]->HTTP Error 504: Gateway Time-out, waiting"
                    time.sleep(5)
                else:
                    break
        if htmltext.find(pagestophtml) >= 0:
            print "End page."
            break
        else:
            findreturn = gethtmlfind(htmltext, articlelistfind)
            for tag in findreturn:
                if tag.string is not None and tag['href'].find(targethost) >= 0:
                    print "[G]->article:" + tag.string + "|url:" + tag['href']
                    articles.append({'name': tag.string, 'url': tag['href']})
            page += pagestep
    return articles
print "[G]->getcategory"
mycategorys = getcategory()
print "[G]->getcategory->success."
time.sleep(3)
for category in mycategorys:
    print "[G]->getarticlelist:" + category['name']
    getarticlelist(category['url'])
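To check the rule chains without hitting a live site, you can feed gethtmlfind an inline HTML snippet. A minimal sketch, assuming the functions above are already defined (run it in place of the crawl entry point at the bottom of the script; the snippet and its URLs are made up for illustration):

# Hypothetical test fixture mimicking the category navigation markup
testhtml = """
<div id="cat-nav">
    <a href="http://adirectory.blog.com/category/python/">Python</a>
    <a href="http://adirectory.blog.com/category/linux/">Linux</a>
</div>
"""
for tag in gethtmlfind(testhtml, categoryfind):
    print "category: %s -> %s" % (tag.string, tag['href'])
# Prints:
# category: Python -> http://adirectory.blog.com/category/python/
# category: Linux -> http://adirectory.blog.com/category/linux/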