
Crawler demo (draft)

#!/usr/bin/env python
# coding:utf-8
 
'''
Created on 2018-08-18
@author: XX创意车间
Scrape the page views of a personal CSDN blog with Python and visualize them
'''
 
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet
 
# Python 2 hack: force UTF-8 as the default string encoding
reload(sys)
sys.setdefaultencoding('utf-8')
 
 
# current page number of the article list
now_page = 1
# last page number of the list
last_page = 2
 
all_url = []
all_id = []

account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account


class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
 
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        # print(self.url)
        return getPage(self.url)
 
    def get_title(self):
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>') 
        # title = pattern.findall(self.page)
        # title = str(title)[2:-2]
        # print "title : %s"%title

        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text()
        # print "title:%s"%title
        
        return title
 
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')   # "阅读数:" is the literal "view count:" label in the page HTML
        # print(self.page)
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]   # findall returns e.g. ['123']; slice off the "['" and "']"
        # title = re.findall('<span class="read-count">(.*?)</span>',self.page,re.S)
        # print("the %d blog page_view is %s"%(self.num,page_view))
        return page_view

    def print_info(self):
        print "the %dth blog , title is —— \n《%s》 \n \t page_view is %s \n --------------***----------------"%(self.num,self.title,self.page_view)
 
def getPage(url):
 
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    # build the request
    # print("url:",url)
    req = urllib2.Request(url,headers=headers)
    # fetch the page
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()

    return myPage
 
while now_page <= last_page:
    print '-----------------------------the %d page ---------------------------------' % (now_page,)

    # fetch the source of this list page
    myUrl = baseUrl+'/article/list/'+str(now_page)
    myPage = getPage(myUrl)
    
 
    soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
    # the total view count lives in a tag like: <dd title="6047"> 6047 </dd>
    # an attribute node: filter tags by the title attribute with a lambda, then read the text
    # https://www.cnblogs.com/cymwill/articles/7574479.html
    total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
    # total_view[1] is the <dd> holding the total view count; strip whitespace
    total_view = total_view[1].get_text().replace(' ','').replace('\n','')
 
    print "total_view : %s"%total_view
 
    idList = re.findall('data-articleid=".*?"',myPage,re.S)
    for id in idList:
        pattern = re.compile(r'\d+')   # pull the digits out of data-articleid="..."
        url_id = pattern.findall(id)
        url_id = str(map(int, url_id))   # e.g. '[12345]'
        url_id = url_id[1:-1]            # strip the brackets -> '12345'

        all_id.append(url_id)
        all_url.append(baseUrl+'/'+url_id)
    
    title = re.findall('<span class="link_title"><a href=".*?">(.*?)</a></span>',myPage,re.S)

    titleList=[]
    for items in title:
        titleList.append(str(items).lstrip().rstrip())

    # extracting each post's view count from the list page is disabled here;
    # the Blog class above reads the count from each article page instead
    # view = re.findall('<span class="link_view".*?><a href=".*?" title="阅读次数">阅读</a>\((.*?)\)</span>',myPage,re.S)

    # print the titles found on this list page
    for n in range(len(titleList)):
        print('title:%s' % titleList[n])

    # move on to the next list page
    now_page = now_page + 1
 
# create a Blog object for every collected article id;
# locals() turns the string 'blog_'+str(i) into a variable name, though keeping them in a list would be simpler
i = 1
for id in all_id:
    locals()['blog_'+str(i)] = Blog(i,id)
    i += 1
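For anyone running this under Python 3, where urllib2 no longer exists, a minimal sketch of the same getPage logic with urllib.request might look like this (an assumption on my part, and CSDN's markup has changed since, so the parsing code would need updating too):

import urllib.request

def get_page(url):
    # spoof a browser User-Agent, exactly as the Python 2 version does
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    return response.read().decode('utf-8')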

 

Latest version, edited on the lab machine (Ubuntu 16), now with a class for the whole set of blog posts:

#!/usr/bin/env python
# coding:utf-8
 
'''
Created on 2016-02-13
@author: xingjiarong
Scrape the page views of a personal CSDN blog with Python; mainly for practice
'''
 
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet
 
# Python 2 hack: force UTF-8 as the default string encoding
reload(sys)
sys.setdefaultencoding('utf-8')
 
 
# if sys.stdout.encoding != 'UTF-8':
# 	sys.stdout = codecs.getwriter('utf-8')(sys.stdout, 'strict')
# if sys.stderr.encoding != 'UTF-8':
# 	sys.stderr = codecs.getwriter('utf-8')(sys.stderr, 'strict')
# a = "我就是中文啊"
# print(chardet.detect(a))
# print("a:",a)
# print("a:",u'哈哈')
# print("h",u"haha")
 

# last page number of the list
last_page = 2
 
all_url = []
all_id = []
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account


class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
 
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
 
    def get_title(self):
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>') 
        # title = pattern.findall(self.page)
 
        # title = str(title)[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        # # print(chardet.detect(title))
        # print("title:",title)
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        # print(chardet.detect(title))
        
        return title
 
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')   # "阅读数:" is the literal "view count:" label in the page HTML
        # print(self.page)
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]   # findall returns e.g. ['123']; slice off the "['" and "']"
        # title = re.findall('<span class="read-count">(.*?)</span>',self.page,re.S)
        # print("the %d blog page_view is %s"%(self.num,page_view))
        return page_view

    def print_info(self):
        print "the %dth blog , and the title is ——\n《%s》\n\t\tthe page_view is %s ! \n-----------***----------"%(self.num,self.title,self.page_view)    


class Blog_set(object):
    def __init__(self,last_page):
        self.blog_list = self.get_blog(last_page)
        self.print_info()


    def get_blog(self,last_page):
        self.all_id,self.total_view = get_all_urls(last_page)

        i = 1
        blog_list = []
        for id in self.all_id:
            # print("id:",id)
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
        
    def print_info(self):
        print "total view count for the whole blog:"
        print self.total_view



def getPage(url):
 
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    # build the request
    # print("url:",url)
    req = urllib2.Request(url,headers=headers)
    # fetch the page
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    
    return myPage
 



def get_all_urls(last_page):
    now_page = 2   # note: this draft starts from list page 2, skipping page 1
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)

        # grab the total view count
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        # it lives in a tag like: <dd title="6047"> 6047 </dd>
        # an attribute node: filter tags by the title attribute with a lambda, then read the text
        # https://www.cnblogs.com/cymwill/articles/7574479.html
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        # total_view[1] is the <dd> holding the total view count; strip whitespace
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')

        idList = re.findall('data-articleid=".*?"',myPage,re.S)

        for id in idList:
            pattern = re.compile(r'\d+')   # pull the digits out of data-articleid="..."
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))   # e.g. '[12345]'
            url_id = url_id[1:-1]            # strip the brackets -> '12345'

            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        # print("all_url",all_url) 
        # print("length",len(all_url))  
        
        
        now_page = now_page + 1

    return all_id,total_view
 

if __name__ == '__main__':
    blog_set = Blog_set(last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1].title

 

Updated: 2018-08-18, 22:09, on the lab machine.

New: a data-set class for one batch of blog posts, sorted by view count.

Not yet implemented: recording the Beijing time of each crawl, saving to a MySQL database, and visualization; a small visualization sketch follows below.
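Since visualization is still on the to-do list, here is a minimal sketch of what it could look like with matplotlib, assuming title_list and view_list hold the titles and view counts that the Blog_set class below collects (the values here are made-up placeholders):

# coding:utf-8
import matplotlib.pyplot as plt

# placeholder data; in the real script these would come from Blog_set
title_list = [u'post A', u'post B', u'post C']
view_list = [120, 75, 30]

plt.barh(range(len(view_list)), view_list)       # one horizontal bar per post
plt.yticks(range(len(view_list)), title_list)    # label each bar with its title
plt.xlabel(u'page views')
plt.tight_layout()
plt.show()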

 

#!/usr/bin/env python
# coding:utf-8
 
'''
Created on 2018-08-18, 22:09
@author: XX创意车间
Scrape the page views of a personal CSDN blog with Python and visualize them
'''
 
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet

# first list page to crawl
now_page = 2
# last page number of the list
last_page = 2
 

account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account


class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
 
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
 
    def get_title(self):
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>') 
        # title = pattern.findall(self.page)
 
        # title = str(title)[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        # # print(chardet.detect(title))
        # print("title:",title)
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        # print(chardet.detect(title))
        
        return title
 
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')   # "阅读数:" is the literal "view count:" label in the page HTML
        # print(self.page)
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]   # findall returns e.g. ['123']; slice off the "['" and "']"
        # title = re.findall('<span class="read-count">(.*?)</span>',self.page,re.S)
        # print("the %d blog page_view is %s"%(self.num,page_view))
        return page_view



    def print_info(self):
        print "the %dth blog , and the title is ——\n《%s》\n\t\tthe page_view is %s ! \n-----------***----------"%(self.num,self.title,self.page_view)    


class Blog_set(object):
    def __init__(self,now_page,last_page):
        self.blog_list = self.get_blog(now_page,last_page)
        self.print_info()
        self.object2dict()


    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)

        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_'+str(i) into a variable name and binds a Blog object to it
            # tem keeps a reference to the same object so it can go into the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list

    def object2dict(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []

        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)

        # plain in-place swap sort: order all three lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]

        print self.num_list
        print self.title_list
        print self.view_list

        # blog_dict = fromkeys()
        
    # def blog_sort(self):
    #     self.blog_list

    def print_info(self):
        print "the number of blogs is %d"%len(self.blog_list)
        print "total view count: %s"%self.total_view



def getPage(url):
 
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    # build the request
    # print("url:",url)
    req = urllib2.Request(url,headers=headers)
    # fetch the page
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    
    return myPage
 

def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []

    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)

        # grab the total view count
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        # it lives in a tag like: <dd title="6047"> 6047 </dd>
        # an attribute node: filter tags by the title attribute with a lambda, then read the text
        # https://www.cnblogs.com/cymwill/articles/7574479.html
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        # total_view[1] is the <dd> holding the total view count; strip whitespace
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')

        idList = re.findall('data-articleid=".*?"',myPage,re.S)

        for id in idList:
            pattern = re.compile(r'\d+')   # pull the digits out of data-articleid="..."
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))   # e.g. '[12345]'
            url_id = url_id[1:-1]            # strip the brackets -> '12345'

            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        # print("all_url",all_url) 
        # print("length",len(all_url))  
        
        
        now_page = now_page + 1

    return all_id,total_view
 

if __name__ == '__main__':
    blog_set = Blog_set(now_page,last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1]
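A side note on object2dict: the three synchronized swap loops can be replaced by a single call to Python's built-in sorted. A minimal sketch, with placeholder data standing in for the lists the method builds:

# view counts arrive as strings from get_page_view
num_list = [1, 2, 3]
title_list = ['post A', 'post B', 'post C']
view_list = ['30', '120', '75']

# one sorted() call replaces the three synchronized swap loops
rows = sorted(zip(view_list, num_list, title_list),
              key=lambda r: int(r[0]), reverse=True)
for view, num, title in rows:
    print('No.%d 《%s》 views: %s' % (num, title, view))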

Added a small snippet to get the year, month, day, and hour:

import time
a = time.localtime()
c = str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])   # year_month_day:hour
print c

Output: '2018_8_19:14'
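The same stamp can be built more directly with time.strftime; note that strftime zero-pads, so this prints '2018_08_19:14' rather than '2018_8_19:14':

import time
# same 'year_month_day:hour' stamp via a format string
print(time.strftime('%Y_%m_%d:%H'))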

So the latest integrated code:

#!/usr/bin/env python
# coding:utf-8
 
'''
Created on 2016-02-13
@author: xingjiarong
Scrape the page views of a personal CSDN blog with Python; mainly for practice
'''
 
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet
import time

# first list page to crawl
now_page = 2
# last page number of the list
last_page = 2
 

account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account


class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
 
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
 
    def get_title(self):
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>') 
        # title = pattern.findall(self.page)
 
        # title = str(title)[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        # # print(chardet.detect(title))
        # print("title:",title)
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        # print(chardet.detect(title))
        
        return title
 
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')   # "阅读数:" is the literal "view count:" label in the page HTML
        # print(self.page)
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]   # findall returns e.g. ['123']; slice off the "['" and "']"
        # title = re.findall('<span class="read-count">(.*?)</span>',self.page,re.S)
        # print("the %d blog page_view is %s"%(self.num,page_view))
        return page_view



    def print_info(self):
        print "the %dth blog , and the title is ——\n《%s》\n\t\tthe page_view is %s ! \n-----------***----------"%(self.num,self.title,self.page_view)    


class Blog_set(object):
    def __init__(self,now_page,last_page):
        self.blog_list = self.get_blog(now_page,last_page)
        self.time = self.get_time()
        self.print_info()
        self.blog_sort()


    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)

        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_'+str(i) into a variable name and binds a Blog object to it
            # tem keeps a reference to the same object so it can go into the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list

    def get_time(self):
        a = time.localtime()
        # 'year_month_day:hour' stamp; return it directly instead of rebinding
        # self.get_time, which would shadow this method with a string
        return str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])

    def blog_sort(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []

        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)

        # plain in-place swap sort: order all three lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]

        print len(self.view_list)
        print len(self.title_list)

        for v,t in zip(self.view_list,self.title_list):
            print "----------***----------\n题目: 《%s》\n浏览量: %s"%(t,v)


        # blog_dict = fromkeys()
        
    # def blog_sort(self):
    #     self.blog_list

    def print_info(self):
        print "the number of blogs is %d"%len(self.blog_list)
        print "total view count: %s"%self.total_view
        print "time of this crawl: %s"%(self.time)



def getPage(url):
 
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    # build the request
    # print("url:",url)
    req = urllib2.Request(url,headers=headers)
    # fetch the page
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    
    return myPage
 

def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []

    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)

        # grab the total view count
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        # it lives in a tag like: <dd title="6047"> 6047 </dd>
        # an attribute node: filter tags by the title attribute with a lambda, then read the text
        # https://www.cnblogs.com/cymwill/articles/7574479.html
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        # total_view[1] is the <dd> holding the total view count; strip whitespace
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')

        idList = re.findall('data-articleid=".*?"',myPage,re.S)

        for id in idList:
            pattern = re.compile(r'\d+')   # pull the digits out of data-articleid="..."
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))   # e.g. '[12345]'
            url_id = url_id[1:-1]            # strip the brackets -> '12345'

            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        # print("all_url",all_url) 
        # print("length",len(all_url))  
        
        
        now_page = now_page + 1

    return all_id,total_view
 

if __name__ == '__main__':
    blog_set = Blog_set(now_page,last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1]

Update, 2018-08-31, 21:41:

Added database support. This needs pymysql and Navicat installed, plus a database named csdn created by hand;

after that the script can create the table and save the data through library calls. Very satisfying~

The formatting was the hard part: I spent a day or two on it, because quoting Chinese data in the SQL kept going wrong, and since every crawl takes a while, debugging was painfully slow; in the end I wrote a separate little demo program just to find a pattern that works.

The database password in the code is replaced with the placeholder 密码; substitute your own~

ps: the earlier problem of cmd refusing to print Chinese went away once I simply ran the program in Spyder instead. Editing and running in one place, very satisfying~
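For what it's worth, the quoting trouble with Chinese titles largely disappears if values are handed to pymysql as query parameters instead of being formatted into the SQL string. A minimal sketch, assuming the same local csdn database as below and table_1 as created by the script for blog-set id 1 (password placeholder included):

# coding:utf-8
import pymysql

db = pymysql.connect("localhost", "root", "密码", "csdn", charset='utf8')
cursor = db.cursor()

# with %s placeholders pymysql escapes and quotes every value itself,
# so Chinese titles need no hand-glued quote characters
sql = 'replace into table_1 (blogs_id, title, page_view) values (%s, %s, %s)'
cursor.execute(sql, (12, u'我是题目', 123))
db.commit()
db.close()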

 

#!/usr/bin/env python
# coding:utf-8
 
'''
Created on 2018-08-31
@author: lyl
Scrape the page views of a personal CSDN blog with Python, persist the data, and visualize it; mainly for practice
'''
 
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet
import time

import pymysql



# first list page to crawl
now_page = 2
# last page number of the list
last_page = 2
 
 
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account
 
 
class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
 
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
 
    def get_title(self):
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>') 
        # title = pattern.findall(self.page)
 
        # title = str(title)[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        # # print(chardet.detect(title))
        # print("title:",title)
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        # print(chardet.detect(title))
        
        return title
 
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')   # "阅读数:" is the literal "view count:" label in the page HTML
        # print(self.page)
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]   # findall returns e.g. ['123']; slice off the "['" and "']"
        # title = re.findall('<span class="read-count">(.*?)</span>',self.page,re.S)
        # print("the %d blog page_view is %s"%(self.num,page_view))
        return page_view
 
    def print_info(self):
        print "the %dth blog , and the title is ——\n《%s》\n\t\tthe page_view is %s ! \n-----------***----------"%(self.num,self.title,self.page_view)    
 
 
class Blog_set(object):
    def __init__(self,id,now_page,last_page):
        self.id = id
        self.blog_list = self.get_blog(now_page,last_page)
        self.time = self.get_time()
        self.blog_sort()
        self.create_table()
        
    def insert_blogs(self):
        pass

# create the table; the values are deliberately pulled out into variables to make later changes easy
    def create_table(self):
#this opens a basic connection: the first argument is the host, localhost or 127.0.0.1 for a local install
#the second is the user name, usually 'root'
#the third is the root password (if you kept the install default it is often 'root'; better to change it)
#the fourth is the database name, the one you created yourself
#the fifth is the character set, usually utf8, written without the dash
Change the password here to your own! This line is deliberately NOT a comment, so the script fails loudly until you notice it!
        db = pymysql.connect("localhost", "root", "密码", "csdn",charset='utf8')
        # get a cursor; standard boilerplate for every connection
        cursor = db.cursor()

#the raw MySQL statement would be:
#create table if not exists table_name(id int primary key not null, title tinytext)
#table_name is the table to create; inside the parentheses, id is the first column and int its type;
#primary key marks the primary key and not null forbids empty values; everything before the comma describes the id column, and title is a second column.
#I pulled these pieces out into variables, and getting the formatting right cost me a long bug hunt!
#tem holds the column definitions, wrapped in a single pair of quotes
        tem = '(blogs_id int primary key not null,title tinytext not null,page_view int not null)'
        print tem
#table_id is the table name; it must be a string and cannot be purely numeric (a numeric string will not work either!)
        table_id = 'table_'+str(self.id)
#in pymysql the create-table command is kept in the sql variable and then handed to execute()
#everything interpolated into the format string here is itself a string
        sql = 'create table if not exists %s %s'%(table_id, tem)
        try:
#try to execute the statement
            cursor.execute(sql)
            print 'Successfully created table (%s)!'%table_id
        except Exception,e:
            print e
            print 'Failed to create table (%s)'%table_id
        
# insert data into the new table
#the overall flow of the program:
#1. the main program currently builds a single blog_set, the blog-collection object.
#2. building it triggers the Blog class, which fetches every post of mine along with its attributes.
#3. Blog_set attributes: id, time, blog_list, total_view
#4. Blog attributes: position number, the article id (the distinctive one from the URL), title, page_view
#5. once the collection has these, it creates its own table and stores the info of every post,
#6. including position, article id, title, and view count.

#so the loop below saves the posts one by one.
        for blog in self.blog_list:
            print blog.id
            print blog.page_view
            blog.id = int(blog.id)
            print blog.title
#careful here: the title is a Chinese string, so the quoting needs attention
#with a literal instead of a variable the statement would be:
#title = '\'中文\''
#sql = 'insert into %s (blogs, title, page_view) values (%d, %s, %d)'%(table_name,id,title,page_view)
#or, spelled out completely:
#sql = 'insert into ta(blogs, title,page_view) values (12,\'我是题目\',123)'
#I could not do much more to the blog.title variable, so I just glued a quote onto each end;
#to my surprise it actually works, and I never found anyone else doing it this way.

            blog.title = '\''+blog.title+'\''
            print blog.title
#convert page_view to int (it comes back from get_page_view as a string)
            blog.page_view = int(blog.page_view)

#the insert statement: the column order must match the values, and don't mix up % and \
#replace instead of insert means: insert when missing, overwrite when present, so no duplicate-key errors~
            sql = 'replace into %s (blogs_id,title,page_view) values (%d,%s,%d)'%(table_id,blog.id,blog.title,blog.page_view)
#            sql = 'insert into %s (blogs_id, title, page_view) values (%d, %s, %d)'%(table_id,blog.id,blog.title,blog.page_view)
            # 执行sql语句
            try:
                print "努力插入中..."
                cursor.execute(sql)
                # commit to the database
                db.commit()
                print 'successfully!'
            except Exception, e:
                # roll back on error
                db.rollback()
                print e
                print 'failed!'
    
        # close the database connection
        db.close()
        
        
 
    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
 
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_'+str(i) into a variable name and binds a Blog object to it
            # tem keeps a reference to the same object so it can go into the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)

        return blog_list
 
    def get_time(self):
        a = time.localtime()
        # 'year_month_day:hour' stamp; return it directly instead of rebinding
        # self.get_time, which would shadow this method with a string
        return str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])
 
    def blog_sort(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []
 
        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)
 
        # plain in-place swap sort: order all three lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]
        
        print len(self.view_list)
        print len(self.title_list)
 
        for v,t,num in zip(self.view_list,self.title_list,self.num_list):
            print "----------***----------\n题目: 《%s》\n浏览量: %s \n第%d篇文章"%(t,v,num)
 
 
        # blog_dict = fromkeys()
        
    # def blog_sort(self):
    #     self.blog_list
 
    def print_info(self):
        print "number of blogs fetched this run: %d"%len(self.blog_list)
        print "total view count: %d"%(int(self.total_view))
        print "time of this crawl: %s"%(self.time)
 
 
 
def getPage(url):
 
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    # build the request
    # print("url:",url)
    req = urllib2.Request(url,headers=headers)
    # fetch the page
    myResponse = urllib2.urlopen(req)
#    print myResponse.info()
    myPage = myResponse.read()
    
    return myPage
 
 
def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []
 
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
 
        # grab the total view count
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        # it lives in a tag like: <dd title="6047"> 6047 </dd>
        # an attribute node: filter tags by the title attribute with a lambda, then read the text
        # https://www.cnblogs.com/cymwill/articles/7574479.html
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        # total_view[1] is the <dd> holding the total view count; strip whitespace
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
 
        idList = re.findall('data-articleid=".*?"',myPage,re.S)

        for id in idList:
            pattern = re.compile(r'\d+')   # pull the digits out of data-articleid="..."
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))   # e.g. '[12345]'
            url_id = url_id[1:-1]            # strip the brackets -> '12345'

            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        # print("all_url",all_url) 
        # print("length",len(all_url))  
        
        
        now_page = now_page + 1
 
    return all_id,total_view
 
 
if __name__ == '__main__':
    blog_set = Blog_set(id = 1,now_page=2,last_page=2)
#    blog_set.print_info()
#    print blog_set.blog_list[-1]

Finally, a screenshot of the run:

[screenshot: crawler demo console output]