# Python 抓取京东商城手机列表 URL 实例代码
#-*- coding: utf-8 -*-
'''
created on 2013-12-5
@author: good-temper
'''
import urllib2
import bs4
import time
def getpage(urlstr):
    '''
    Fetch a page and return its raw body.

    Args:
        urlstr: URL to request; passed unchanged to ``urllib2.urlopen``.

    Returns:
        Whatever ``read()`` on the opened response returns.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or by ``read()`` is
        propagated to the caller.
    '''
    response = urllib2.urlopen(urlstr)
    try:
        return response.read()
    finally:
        # Release the underlying connection even when read() fails.
        response.close()
def getnextpageurl(currpagenum):
    '''
    Return the URL of the listing page that follows ``currpagenum``.

    The candidate page (number ``currpagenum + 1``) is downloaded with
    ``getpage`` and searched for ``<span class="next-disabled">``.

    Args:
        currpagenum: integer number of the page processed last (0 before
            the first page).

    Returns:
        The candidate page URL when that page contains no ``next-disabled``
        span, otherwise the empty string ``''``.
    '''
    # URL template:
    # http://list.jd.com/9987-653-655-0-0-0-0-0-0-0-1-1-<page>-1-1-72-4137-33.html
    url = (u'http://list.jd.com/9987-653-655-0-0-0-0-0-0-0-1-1-'
           + str(currpagenum + 1)
           + '-1-1-72-4137-33.html')

    # Is there a next page?  bs4 exposes the parser class as ``BeautifulSoup``
    # and the search method as ``find_all``; other spellings do not resolve.
    soup = bs4.BeautifulSoup(getpage(url))
    disabled_markers = soup.find_all('span', {'class': 'next-disabled'})

    # NOTE(review): a page that carries the marker is reported as "no next
    # page", so that page itself is never returned - presumably this is the
    # last listing page; confirm that skipping it is intended.
    if not disabled_markers:
        return url
    return ''
def analyzelist():
    '''
    Walk the listing pages and collect the product links found on them.

    Pages are requested one after another through ``getnextpageurl`` until
    it returns ``''``.  On every page, each ``<div class="p-name">`` block
    contributes the ``href`` of its first ``<a>`` element.  The running page
    count is printed after each page.

    Returns:
        A list with the collected ``href`` values, in page order.
    '''
    pagenum = 0
    hrefs = []
    url = getnextpageurl(pagenum)
    while url != '':
        soup = bs4.BeautifulSoup(getpage(url))
        for block in soup.find_all('div', {'class': 'p-name'}):
            # Search the tag directly; re-parsing str(block) is unnecessary.
            anchor = block.find('a')
            # Skip blocks without a usable link instead of aborting the crawl.
            if anchor is None or anchor.get('href') is None:
                continue
            hrefs.append(anchor['href'])
        pagenum = pagenum + 1
        print(pagenum)
        url = getnextpageurl(pagenum)
    return hrefs
def analyzecontent(url):
    '''
    Placeholder: the argument is ignored and an empty string is returned.
    '''
    nothing = ''
    return nothing
def writetofile(list, path):
    '''
    Append every entry of ``list`` to the file at ``path``, one per line.

    Args:
        list: iterable of strings; a newline is appended to each entry.
            (The parameter name shadows the builtin but is kept so that
            existing keyword callers keep working.)
        path: file to open in append mode; created if it does not exist.

    Raises:
        Errors from ``open`` or from writing are propagated; the file is
        closed in either case.
    '''
    # ``with`` closes the handle even if a write fails part-way through.
    with open(path, 'a') as f:
        f.writelines(elem + '\n' for elem in list)
if __name__ == '__main__':
    # Script entry point: crawl the listing, report the number of collected
    # links, then append them to the output file.
    # The result is bound to ``phone_urls`` rather than ``list`` so the
    # builtin is not shadowed at module scope.
    phone_urls = analyzelist()
    print('共抓取' + str(len(phone_urls)) + '条\n')
    writetofile(phone_urls, u'e:\\jd_phone_list.dat')