爬虫学习
程序员文章站
2022-03-02 19:32:37
...
抓取糗事百科
#!/usr/bin/python
# -*- coding:utf-8 -*-
import sys
import urllib
import urllib2
import re
import cPickle as P
# Scrape one "hot" page of qiushibaike.com, print the extracted posts, cache
# the formatted text in a pickle file and read it back as a sanity check.
#
# This script targets Python 2: it relies on the file-level imports of
# ``urllib2`` and ``cPickle as P``.
#
# The former ``reload(sys); sys.setdefaultencoding('utf-8')`` hack is no
# longer needed: the page is decoded and the output encoded explicitly, so no
# implicit str/unicode conversion of non-ASCII data takes place.

PAGE = 1
URL = "http://www.qiushibaike.com/hot/page/" + str(PAGE)
# Pretend to be a browser ("Mozilla" was misspelled in the original).
HEADERS = {"User-Agent": "Mozilla/4.0(compatible;MSIE 5.5;Windows NT)"}
OUTPUT_PATH = "qiubai.txt"
# Encoding used for the console output and the cached text (the original
# script produced GBK bytes as well).
OUTPUT_ENCODING = "gbk"

# Line-oriented pattern for one post.  The newline escapes had lost their
# backslashes (a bare ``n``) and are restored here.  Groups: (1) the tag
# inside the author link (unused), (2) the author name inside <h2>,
# (3) the line holding the post text.
ITEM_PATTERN = re.compile(
    r'<div.*?class="author.*".*?>\n<a.*?>\n<(.*?)>\n</a>\n.*\n<h2>(.*)</h2>'
    r'\n.*\n.*\n{3}<div.*>\n{2}(.*)\n.*\n{2}.*\n{4}.*'
)


def fetch_page(url, headers):
    """Download *url* with the given request headers and return its text.

    The body is decoded as UTF-8 (assumption carried over from the original
    script, which implicitly treated the bytes as UTF-8 -- confirm against
    the site); undecodable bytes are replaced instead of raising.

    Raises:
        urllib2.URLError: if the request fails (HTTPError included).
    """
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read().decode("utf-8", "replace")
    finally:
        response.close()


def extract_items(content):
    """Return a list of 3-tuples, one per post matched in *content*."""
    return ITEM_PATTERN.findall(content)


def format_items(items):
    """Render *items* as numbered ``(N)author\\ntext\\n\\n`` entries.

    Only fields 1 (author) and 2 (text) of each item are used.  Returns an
    empty string when *items* is empty.
    """
    return "".join(
        "(%d)%s\n%s\n\n" % (number, item[1], item[2])
        for number, item in enumerate(items, 1)
    )


def save_text(text, path):
    """Pickle *text* into the file at *path*, replacing previous content."""
    with open(path, "wb") as out:
        P.dump(text, out)


def load_text(path):
    """Return the object pickled in the file at *path*.

    NOTE: unpickling is only safe for trusted files; here the file has just
    been written by this script.
    """
    with open(path, "rb") as src:
        return P.load(src)


def main():
    """Run the scrape; return 0 on success and 1 if the download failed."""
    try:
        content = fetch_page(URL, HEADERS)
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Nothing was downloaded: leave any existing cache file untouched and
        # skip the read-back, which used to crash with EOFError on the
        # freshly truncated file.
        return 1

    text = format_items(extract_items(content))
    # Encode once at the output boundary; characters that the target encoding
    # cannot represent are replaced rather than aborting the run.
    encoded = text.encode(OUTPUT_ENCODING, "replace")
    print(encoded)

    save_text(encoded, OUTPUT_PATH)
    # Read the cache back to confirm that it round-trips.
    print(load_text(OUTPUT_PATH))
    return 0


if __name__ == "__main__":
    sys.exit(main())
下一篇: 爬虫学习
发表评论 取消回复
电子邮件地址不会被公开。 必填项已用*标注
姓名 *
电子邮件 *
站点
评论
您可以使用这些HTML标签和属性:
<a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>