
Crawler Basics: urllib Crawler Examples


Douban homepage:

import urllib.request

# Target URL
url = "http://www.douban.com/"

# Build a Request object
request = urllib.request.Request(url)

# Send the request and fetch the response
response = urllib.request.urlopen(request)

data = response.read()

# Decode the raw bytes into text
data = data.decode('utf-8')

print(type(response))
print(response.geturl())
print(response.info())
print(response.getcode())
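
If Douban rejects the default Python-urllib User-Agent, urlopen raises an exception instead of returning a response. A minimal sketch of catching those errors (not part of the original example):

import urllib.request
import urllib.error

url = "http://www.douban.com/"

try:
    response = urllib.request.urlopen(urllib.request.Request(url), timeout=10)
    print(response.getcode())
except urllib.error.HTTPError as e:
    # The server answered but refused the request (e.g. 403 or 418)
    print("HTTP error:", e.code)
except urllib.error.URLError as e:
    # Network-level failure: DNS error, connection refused, timeout, ...
    print("URL error:", e.reason)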

Pretending to be a browser:

Change two lines of code to add a request header:

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}  
#headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '  
#                        'Chrome/51.0.2704.63 Safari/537.36'} 

req = urllib.request.Request(url=url, headers=headers)
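
For completeness, the same header can also be attached after the Request has been constructed, using Request.add_header (an equivalent sketch; the url and User-Agent string are the same as above):

import urllib.request

url = "https://www.douban.com/"
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

# Equivalent to passing headers= to the constructor
req = urllib.request.Request(url)
req.add_header('User-Agent', user_agent)

response = urllib.request.urlopen(req)
print(response.getcode())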

Downloading images from the Douban homepage:

'''
Batch-download the images on the Douban homepage.

Pretend to be a browser, fetch the Douban homepage, and save every image it
references into the target folder.
'''

# Import the required libraries
import urllib.request,socket,re,sys,os

# Folder where the downloaded images are saved
targetPath = "E:\\workspace\\pachong\\images"

def saveFile(path):
    # Create the target folder if it does not exist yet
    if not os.path.isdir(targetPath):
        os.mkdir(targetPath)

    # Build the local path for this image:
    # rindex('/') is the position of the last slash, so path[pos+1:] is the file name
    pos = path.rindex('/')
    t = os.path.join(targetPath,path[pos+1:])
    return t

# Target URL
url = "https://www.douban.com/"  
headers = {  
              'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'  
           }  

req = urllib.request.Request(url=url, headers=headers)  

res = urllib.request.urlopen(req)  

data = res.read()  

# Match image URLs ending in jpg/png/gif; [^\s"] keeps the match inside a single URL
for link,t in set(re.findall(r'(https:[^\s"]*?\.(jpg|png|gif))', str(data))):

    print(link)
    try:
        # Download the linked image into the target folder
        urllib.request.urlretrieve(link,saveFile(link))
    except Exception as e:
        print('download failed:', link, e)
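
urlretrieve accepts no timeout argument of its own, so a single stalled download can hang the loop above. Since the script already imports socket, one option (a sketch, not in the original post; the example URL below is made up) is to set a global default socket timeout:

import socket
import urllib.request

# Any blocking socket operation, including urlretrieve, now raises an
# error after 10 seconds instead of hanging indefinitely.
socket.setdefaulttimeout(10)

example_link = "https://img3.doubanio.com/view/photo/example.jpg"  # hypothetical URL
try:
    urllib.request.urlretrieve(example_link, "example.jpg")
except Exception as e:
    print("download failed:", e)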

Crawling my blog:

import urllib.request,re,time,random,gzip,sys,os  

targetPath = "E:\\workspace\\pachong\\content"  

def saveContent(path):
    # Create the target folder if it does not exist yet
    if not os.path.isdir(targetPath):
        os.mkdir(targetPath)

    # Build the local path for this article page:
    # rindex('/') is the position of the last slash, so path[pos+1:] is the file name
    pos = path.rindex('/')
    t = os.path.join(targetPath,path[pos+1:])
    return t

# Save the list of post entries for one page to a text file
def saveFile(data,i):
    path = "E:\\workspace\\pachong\\data\\paper_"+str(i+1)+".txt"
    file = open(path,'wb')
    page = 'Current page: '+str(i+1)+'\n'
    file.write(page.encode('gbk'))
    # Write each post entry to the file (encoded as gbk)
    for d in data:
        d = str(d)+'\n'
        file.write(d.encode('gbk'))
    file.close()

# Decompress gzip-encoded response data
def ungzip(data):
    try:
        data = gzip.decompress(data)
    except Exception:
        # The response was not compressed; return it unchanged
        print("Not gzip-compressed, no need to decompress...")
    return data

# Crawler for a CSDN blog's article list
class CSDNSpider:  
    def __init__(self,pageIdx=1,url="http://blog.csdn.net/lk7688535/article/list/1"):  
        # Current page index (defaults to page 1)
        self.pageIdx = pageIdx  
        self.url = url[0:url.rfind('/') + 1] + str(pageIdx)  
        self.headers = {  
            "Connection": "keep-alive",  
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3368.400 QQBrowser/9.6.11974.400",  
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",  
            "Accept-Encoding": "gzip, deflate, sdch",  
            "Accept-Language": "zh-CN,zh;q=0.8",  
            "Host": "blog.csdn.net"  
        }  

    # Get the total number of article-list pages
    def getPages(self):  
        req = urllib.request.Request(url=self.url, headers=self.headers)  
        res = urllib.request.urlopen(req)  

        # The response from the CSDN blog is gzip-compressed, so decompress it first
        data = res.read()  
        data = ungzip(data)  
        data = data.decode('utf-8')  

        pages = r'<div.*?pagelist">.*?<span>.*?共(.*?)页</span>'  

        # Extract the total number of pages of posts
        pattern = re.compile(pages, re.DOTALL)  
        pagesNum = re.findall(pattern, data)[0]  
        return pagesNum  

    # Point the crawler at the given article-list page
    def setPage(self,idx):  
        self.url = self.url[0:self.url.rfind('/')+1]+str(idx)  

    # Read the title and link of every post on the current page
    def readData(self):
        ret=[]
        # Regex for one article entry: captures the href and the title text
        item_pattern = r'<div.*?list_item article_item">.*?link_title"><a href="(.*?)">(.*?)</a>'
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)

        # The response is gzip-compressed, so decompress it first
        data = res.read()
        data = ungzip(data)
        data = data.decode('utf-8')
        pattern = re.compile(item_pattern,re.DOTALL)
        items = re.findall(pattern,data)
        for item in items:
            print("Link: "+item[0])
            ret.append('Title: '+item[1]+'  Link: '+item[0]+'\n')
            link = "http://blog.csdn.net/"+item[0]
            urllib.request.urlretrieve(link,saveContent(link))
        return ret

# Create the crawler object
cs = CSDNSpider()

# Get the total number of pages
pagesNum = int(cs.getPages())
print("Total pages of posts: ",pagesNum)

for idx in range(pagesNum):
    # Article-list pages are numbered from 1, so request page idx + 1
    cs.setPage(idx + 1)
    print("Current page:",idx+1)
    # Read all posts on the current page; the result is a list
    papers = cs.readData()
    saveFile(papers,idx)
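
The script imports time and random but never uses them; a common refinement (a sketch, not in the original) is to pause for a random interval between pages so the requests are spread out:

import time, random

for idx in range(pagesNum):
    cs.setPage(idx + 1)
    print("Current page:", idx + 1)
    papers = cs.readData()
    saveFile(papers, idx)
    # Wait 1-3 seconds before fetching the next page
    time.sleep(random.uniform(1, 3))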

Example adapted from:
http://blog.csdn.net/fly_yr/article/details/51535676