Web Crawler
# -*- coding: utf-8 -*-
import re
import urllib2
import urllib
import os

top_level_url = r"http://linux.linuxidc.com/"
def findDir(url):
    # If url names a directory, mirror it locally and step into it.
    url_decode = urllib.unquote(url)
    url_list = url_decode.split('/')
    next_dir = url_list[-2]
    if url_list[-1]:
        # Non-empty last segment: the URL names a file, nothing to create.
        return
    else:
        if os.path.exists(next_dir):
            print next_dir + ' directory already exists.'
        else:
            print 'Directory does not exist, creating ' + next_dir
            os.mkdir(next_dir)
        # Enter the new directory
        os.chdir(next_dir)
        #print '[cwd-1]: ' + os.getcwd()
        return
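Two hypothetical calls (not part of the original script) illustrate findDir's two paths:

findDir('http://linux.linuxidc.com/docs/')        # last segment '' -> creates ./docs and chdirs into it
findDir('http://linux.linuxidc.com/docs/a.pdf')   # last segment 'a.pdf' -> returns without touching the filesystem

Because findDir changes the process-wide working directory, the matching os.chdir('../') inside findURL below is what keeps the local tree aligned with the remote one.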
def findURL(pre_url, url):
    #print '[p-url]: ' + pre_url
    #print '[url  ]: ' + url
    curr_url = ''
    next_url = ''
    url_split = url.split("/")
    if url_split[-1]:
        # Non-empty last segment: the URL names a file, so download it;
        # an empty last segment means a directory, handled below.
        filename = urllib.unquote(url_split[-1][:-4])  # drop the 4-char extension, e.g. '.pdf'
        if os.path.exists(filename):
            print "File already exists..."
        else:
            print "File does not exist, downloading..."
            print "URL: " + url + '\n'
            try:
                f = urllib2.urlopen(url)
            except urllib2.HTTPError, e:
                print 'The server couldn\'t fulfill the request.'
                print 'Error code:', e.code
                return
            except urllib2.URLError, e:
                print 'We failed to reach a server.'
                print 'Reason:', e.reason
                return
            else:
                data = f.read()
                with open(filename, "wb") as code:
                    code.write(data)
                f.close()
        return
    else:
        # The URL names a directory: fetch the listing and walk its links.
        print "Creating directory..."
        UrlsPattern = re.compile(r'<A.*?>')
        request = urllib2.Request(url)
        try:
            html = urllib2.urlopen(request, timeout=10).read()
        except urllib2.HTTPError, e:
            print 'The server couldn\'t fulfill the request.'
            os.chdir('../')
            print 'Error code:', e.code
        except urllib2.URLError, e:
            print 'We failed to reach a server.'
            print 'Reason:', e.reason
        else:
            curr_url = request.get_full_url()
            #print '[current URL] ' + curr_url
            # Pull every anchor tag out of the listing page.
            urls = UrlsPattern.findall(html)
            if urls:
                for item in urls:
                    next_url = top_level_url + item[10:-2]
                    if next_url == pre_url:
                        # The listing links back to its parent; skip it.
                        print "Page already visited..."
                    else:
                        findDir(next_url)
                        findURL(curr_url, next_url)
            print "Returning to parent directory..."
            # Back out of the directory entered by findDir, even if the
            # listing contained no links.
            os.chdir('../')
            #print '[cwd-2]: ' + os.getcwd()
        return
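The slice item[10:-2] in the loop above assumes every match of UrlsPattern looks exactly like <A HREF="/path">: it strips the nine-character <A HREF=" prefix together with the link's leading slash (top_level_url already ends in '/') and the closing ">. A small sketch of that extraction, assuming that exact markup:

item = '<A HREF="/2012/readme.txt">'
print item[10:-2]                    # 2012/readme.txt
print top_level_url + item[10:-2]    # http://linux.linuxidc.com/2012/readme.txt

If the server ever emitted lowercase <a href=...> tags or absolute URLs, both the regex and the slice would need adjusting.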
#url = r"http://linux.linuxidc.com/2012%E5%B9%B4%E8%B5%84%E6%96%99/"
username = "www.linuxidc.com"
password = "www.linuxidc.com"
password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, top_level_url, username, password)
handler = urllib2.HTTPBasicAuthHandler(password_mgr)
opener = urllib2.build_opener(urllib2.HTTPHandler, handler)
urllib2.install_opener(opener)
findURL("", top_level_url)
Reposted from: https://my.oschina.net/xushizhe/blog/338956