Getting Started with Web Scraping: urllib Crawler Examples
Douban homepage:
import urllib.request

# URL to fetch
url = "http://www.douban.com/"
# Build a request object
request = urllib.request.Request(url)
# Send the request and get the response
response = urllib.request.urlopen(request)
data = response.read()
# Decode the raw bytes as UTF-8
data = data.decode('utf-8')
print(type(response))
print(response.geturl())
print(response.info())
print(response.getcode())
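If the request fails (for example because the site rejects requests that do not look like a browser), urlopen raises an exception. A minimal sketch of catching it with urllib.error, not part of the original example, could look like this:

import urllib.request, urllib.error

url = "http://www.douban.com/"
try:
    response = urllib.request.urlopen(urllib.request.Request(url), timeout=10)
    print(response.getcode())                       # e.g. 200 on success
except urllib.error.HTTPError as e:
    print('HTTP error:', e.code)                    # the server answered with an error status
except urllib.error.URLError as e:
    print('Could not reach the server:', e.reason)  # DNS failure, refused connection, timeout...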
Disguising as a browser:
Change two lines and pass a request header (a full sketch follows the snippet below):
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
#headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
# 'Chrome/51.0.2704.63 Safari/537.36'}
req = urllib.request.Request(url=url,headers=headers)
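Putting the two changes together, a minimal sketch of the disguised request (same site as above; the User-Agent string is only an example) might look like this:

import urllib.request

url = "https://www.douban.com/"
# Pretend to be a desktop Chrome browser
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
print(response.getcode())   # 200 if the disguised request was accepted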
Downloading the Douban homepage images:
'''
Batch-download the images on the Douban homepage.
Fetch the page while disguised as a browser and save every image it links to
into the target folder.
'''
# Import the required libraries
import urllib.request, socket, re, sys, os

# Folder the images will be saved to
targetPath = "E:\\workspace\\pachong\\images"

def saveFile(path):
    # Make sure the target path exists and is a directory
    if not os.path.isdir(targetPath):
        os.mkdir(targetPath)
    # Build the local path for each image:
    # rindex gives the position of the last '/' in the URL
    pos = path.rindex('/')
    # Join the target folder with the file name taken from the URL
    t = os.path.join(targetPath, path[pos+1:])
    return t

# URL
url = "https://www.douban.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
data = res.read()

# Find every jpg/png/gif link; set() removes duplicate links
for link, t in set(re.findall(r'(https:[^\s]*?(jpg|png|gif))', str(data))):
    print(link)
    try:
        # Download the linked image into the target folder
        urllib.request.urlretrieve(link, saveFile(link))
    except:
        print('download failed')
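The script imports socket but never uses it; one plausible reason to import it (an assumption on my part, not stated in the original) is to set a global timeout so a stalled download cannot hang the whole loop:

import socket
# Give up on any single connect/read after 10 seconds (the value is arbitrary)
socket.setdefaulttimeout(10)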
Crawling my own blog (CSDN):
import urllib.request, re, time, random, gzip, sys, os

# Folder the downloaded article pages will be saved to
targetPath = "E:\\workspace\\pachong\\content"

def saveContent(path):
    # Make sure the target path exists and is a directory
    if not os.path.isdir(targetPath):
        os.mkdir(targetPath)
    # Build the local path for each downloaded page:
    # rindex gives the position of the last '/' in the URL
    pos = path.rindex('/')
    # Join the target folder with the file name taken from the URL
    t = os.path.join(targetPath, path[pos+1:])
    return t
# Save the post list of one page to a text file
def saveFile(data, i):
    path = "E:\\workspace\\pachong\\data\\paper_" + str(i+1) + ".txt"
    file = open(path, 'wb')
    page = 'Current page: ' + str(i+1) + '\n'
    file.write(page.encode('gbk'))
    # Write one line per blog post, encoded as gbk
    for d in data:
        d = str(d) + '\n'
        file.write(d.encode('gbk'))
    file.close()

# Decompress gzip-compressed response data
def ungzip(data):
    try:
        # print("Decompressing...")
        data = gzip.decompress(data)
        # print("Decompression finished...")
    except:
        print("Not compressed, nothing to decompress...")
    return data
# CSDN crawler class
class CSDNSpider:
    def __init__(self, pageIdx=1, url="http://blog.csdn.net/lk7688535/article/list/1"):
        # Current page, defaults to the first one
        self.pageIdx = pageIdx
        self.url = url[0:url.rfind('/') + 1] + str(pageIdx)
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3368.400 QQBrowser/9.6.11974.400",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Host": "blog.csdn.net"
        }

    # Work out the total number of article-list pages
    def getPages(self):
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)
        # The CSDN blog pages are served gzip-compressed, so decompress them first
        data = res.read()
        data = ungzip(data)
        data = data.decode('utf-8')
        # The pagination block contains "共N页" ("N pages in total")
        pages = r'<div.*?pagelist">.*?<span>.*?共(.*?)页</span>'
        pattern = re.compile(pages, re.DOTALL)
        pagesNum = re.findall(pattern, data)[0]
        return pagesNum

    # Point the crawler at a specific article-list page
    def setPage(self, idx):
        self.url = self.url[0:self.url.rfind('/') + 1] + str(idx)

    # Read the post titles and links on the current page
    def readData(self):
        ret = []
        # Regex capturing the href and title of each post entry
        item_re = r'<div.*?list_item article_item">.*?link_title"><a href="(.*?)">(.*?)</a>'
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)
        # This page is also gzip-compressed, so decompress it first
        data = res.read()
        data = ungzip(data)
        data = data.decode('utf-8')
        pattern = re.compile(item_re, re.DOTALL)
        items = re.findall(pattern, data)
        for item in items:
            print("Link: " + item[0])
            ret.append('Title: ' + item[1] + '  Link: ' + item[0] + '\n')
            link = "http://blog.csdn.net/" + item[0]
            urllib.request.urlretrieve(link, saveContent(link))
        return ret
# Create the crawler object
cs = CSDNSpider()
# Get the total number of pages
pagesNum = int(cs.getPages())
print("Total number of pages: ", pagesNum)

for idx in range(pagesNum):
    # Pages are numbered from 1, so shift the 0-based loop index
    cs.setPage(idx + 1)
    print("Current page:", idx + 1)
    # Read all posts on the current page; the result is a list
    papers = cs.readData()
    saveFile(papers, idx)
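time and random are imported at the top but never used above; a typical reason to have them (my assumption, not something the original states) is to pause between page requests so the crawler puts less load on the server, for example at the end of each loop iteration:

import time, random
# Sleep a random 1-3 seconds between pages (the range is arbitrary)
time.sleep(random.uniform(1, 3))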