Crawler demo (draft)
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2018-08-18
@author: XX创意车间
Scrape the page-view counts of a personal CSDN blog with Python, for later visualization.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet

reload(sys)
sys.setdefaultencoding('utf-8')
type = sys.getfilesystemencoding()

# current page number of the article list
now_page = 1
# number of the last list page
last_page = 2
all_url = []
all_id = []
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        # print(self.url)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempt, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2]
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text()
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        # title = re.findall('<span class="read-count">(.*?)</span>',self.page,re.S)
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\tpage views: %s\n--------------***----------------"%(self.num,self.title,self.page_view)

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    # build the request
    req = urllib2.Request(url,headers=headers)
    # fetch the page
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    return myPage

while now_page <= last_page:
    print '-----------------------------the %d page ---------------------------------' % (now_page,)
    # fetch the source of this list page
    myUrl = baseUrl+'/article/list/'+str(now_page)
    myPage = getPage(myUrl)
    soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
    # the total view count sits in an attribute node:
    # <dd title="6047">
    #     6047 </dd>
    # pass a predicate to find_all, then read the text
    # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
    total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
    total_view = total_view[1].get_text().replace(' ','').replace('\n','')
    print "total_view : %s"%total_view
    idList = re.findall('data-articleid=".*?"',myPage,re.S)
    for id in idList:
        # pull the numeric article id out of the attribute
        pattern = re.compile(r'\d+')
        url_id = pattern.findall(id)
        url_id = str(map(int, url_id))
        url_id = url_id[1:-1]
        all_id.append(url_id)
        all_url.append(baseUrl+'/'+url_id)
    title = re.findall('<span class="link_title"><a href=".*?">(.*?)</a></span>',myPage,re.S)
    titleList=[]
    for items in title:
        titleList.append(str(items).lstrip().rstrip())
    # the per-post view counts used to be pulled here with a regex like the one
    # below, but that markup changed, so viewList is no longer built and only
    # the titles are printed (per-post views come from the Blog class instead):
    # view = re.findall('<span class="link_view".*?><a href=".*?" title="阅读次数">阅读</a>\((.*?)\)</span>',myPage,re.S)
    for n in range(len(titleList)):
        print('title:%s' % (titleList[n],))
    # advance to the next page
    now_page = now_page + 1

i = 1
for id in all_id:
    locals()['blog_'+str(i)] = Blog(i,id)
    i += 1
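A side note on that last loop: locals()['blog_'+str(i)] conjures the names blog_1, blog_2, ... at run time, but nothing ever reads those names back, so a plain list does the same job. A minimal sketch of the simplification, assuming the Blog class and all_id list defined above:

# Hypothetical simplification of the loop above: skip the dynamic
# variable names and keep the Blog objects in an ordinary list.
blogs = []
for i, id in enumerate(all_id, start=1):
    blogs.append(Blog(i, id))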
The latest version, edited on the lab's Ubuntu 16 machine; it adds a class for the whole blog dataset:
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2016-02-13
@author: xingjiarong
Scrape the page-view counts of a personal CSDN blog with Python; mainly for practice.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet

reload(sys)
sys.setdefaultencoding('utf-8')
type = sys.getfilesystemencoding()
# if sys.stdout.encoding != 'UTF-8':
#     sys.stdout = codecs.getwriter('utf-8')(sys.stdout, 'strict')
# if sys.stderr.encoding != 'UTF-8':
#     sys.stderr = codecs.getwriter('utf-8')(sys.stderr, 'strict')

# number of the last list page
last_page = 2
all_url = []
all_id = []
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempts, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\t\tpage views: %s\n-----------***----------"%(self.num,self.title,self.page_view)

class Blog_set(object):
    def __init__(self,last_page):
        self.blog_list = self.get_blog(last_page)
        self.print_info()
    def get_blog(self,last_page):
        self.all_id,self.total_view = get_all_urls(last_page)
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_i' into a variable name and binds
            # a new Blog object to it; tem keeps the same reference for the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
    def print_info(self):
        print "total view count for this batch:"
        print self.total_view

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    return myPage

def get_all_urls(last_page):
    # this draft starts from list page 2
    now_page = 2
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
        # the total view count sits in an attribute node:
        # <dd title="6047">
        #     6047 </dd>
        # pass a predicate to find_all, then read the text
        # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
        idList = re.findall('data-articleid=".*?"',myPage,re.S)
        for id in idList:
            # pull the numeric article id out of the attribute
            pattern = re.compile(r'\d+')
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))
            url_id = url_id[1:-1]
            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        now_page = now_page + 1
    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1].title
Updated 2018-08-18 22:09, on the lab machine.
New: a dataset class for each crawl batch, which sorts the posts by view count.
Not yet implemented: recording the Beijing time of each crawl, saving to MySQL, and visualization.
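A note on that sorting: the version below implements it in object2dict as a hand-rolled bubble sort over three parallel lists (num_list, title_list, view_list). For reference, a minimal sketch of the same descending sort-by-views using the built-in sorted; the three sample lists here are stand-ins:

# Sort three parallel lists by view count, descending; equivalent in
# effect to the bubble sort in object2dict below.
num_list = [1, 2, 3]
title_list = ['a', 'b', 'c']
view_list = ['120', '45', '300']

# zip the rows together, sort once by the integer view count
rows = sorted(zip(num_list, title_list, view_list),
              key=lambda row: int(row[2]), reverse=True)
num_list, title_list, view_list = [list(t) for t in zip(*rows)]
print(view_list)  # ['300', '120', '45']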
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2018-08-18 22:09
@author: XX创意车间
Scrape the page-view counts of a personal CSDN blog with Python, for later visualization.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet

# first list page
now_page = 2
# number of the last list page
last_page = 2
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempts, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\t\tpage views: %s\n-----------***----------"%(self.num,self.title,self.page_view)

class Blog_set(object):
    def __init__(self,now_page,last_page):
        self.blog_list = self.get_blog(now_page,last_page)
        self.print_info()
        self.object2dict()
    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_i' into a variable name and binds
            # a new Blog object to it; tem keeps the same reference for the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
    def object2dict(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []
        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)
        # bubble-sort the three parallel lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]
        print self.num_list
        print self.title_list
        print self.view_list
    def print_info(self):
        print "the number of blogs is %d"%len(self.blog_list)
        print self.total_view

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    return myPage

def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
        # the total view count sits in an attribute node:
        # <dd title="6047">
        #     6047 </dd>
        # pass a predicate to find_all, then read the text
        # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
        idList = re.findall('data-articleid=".*?"',myPage,re.S)
        for id in idList:
            # pull the numeric article id out of the attribute
            pattern = re.compile(r'\d+')
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))
            url_id = url_id[1:-1]
            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        now_page = now_page + 1
    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(now_page,last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1]
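One caveat about that last line: blog_set.blog_list[-1] is a Blog object, so print shows only the default repr, something like <__main__.Blog object at 0x...>. If readable output is wanted there, a __repr__ could be added to Blog; the method below is a hypothetical addition, not part of the original script:

class Blog(object):
    # ... existing __init__ and scraping methods ...
    def __repr__(self):
        # readable representation, so printing a Blog (or the last element of
        # blog_list) shows the title and view count instead of the default repr
        return 'Blog(#%d, 《%s》, views=%s)' % (self.num, self.title, self.page_view)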
Added a small snippet that builds a year_month_day:hour timestamp string:
import time
a = time.localtime()
c = str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])
print c
Output: '2018_8_19:14'
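The same string can be produced in one call with time.strftime; a minimal equivalent (note that strftime zero-pads, so it prints '2018_08_19:14' rather than '2018_8_19:14'):

import time

# one-call equivalent of the index-based version above
c = time.strftime('%Y_%m_%d:%H', time.localtime())
print(c)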
So here is the latest integrated code:
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2016-02-13
@author: xingjiarong
Scrape the page-view counts of a personal CSDN blog with Python; mainly for practice.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet
import time

# first list page
now_page = 2
# number of the last list page
last_page = 2
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempts, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\t\tpage views: %s\n-----------***----------"%(self.num,self.title,self.page_view)

class Blog_set(object):
    def __init__(self,now_page,last_page):
        self.blog_list = self.get_blog(now_page,last_page)
        self.time = self.get_time()
        self.print_info()
        self.blog_sort()
    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_i' into a variable name and binds
            # a new Blog object to it; tem keeps the same reference for the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
    def get_time(self):
        # record when this batch was crawled, as a 'YYYY_M_D:H' string
        a = time.localtime()
        return str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])
    def blog_sort(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []
        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)
        # bubble-sort the three parallel lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]
        print len(self.view_list)
        print len(self.title_list)
        for v,t in zip(self.view_list,self.title_list):
            print "----------***----------\nTitle: 《%s》\nViews: %s"%(t,v)
    def print_info(self):
        print "the number of blogs is %d"%len(self.blog_list)
        print self.total_view
        print "This batch was fetched at: %s"%(self.time)

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    return myPage

def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
        # the total view count sits in an attribute node:
        # <dd title="6047">
        #     6047 </dd>
        # pass a predicate to find_all, then read the text
        # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
        idList = re.findall('data-articleid=".*?"',myPage,re.S)
        for id in idList:
            # pull the numeric article id out of the attribute
            pattern = re.compile(r'\d+')
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))
            url_id = url_id[1:-1]
            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        now_page = now_page + 1
    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(now_page,last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1]
Update, 2018-08-31 21:41:
Added database support. Install pymysql and Navicat, and manually create a database named csdn; after that the script creates the table and saves the data through library calls. Very satisfying~
The formatting, though, took me a day or two to sort out: quoting Chinese data kept failing, and since every crawl takes a while, iterating was painfully slow. In the end I wrote a separate little demo program just to find a pattern that works.
I replaced my real database password with the placeholder 'your_password'; change it to your own~
PS: the earlier problem of cmd refusing to print Chinese went away once I switched to running the program in Spyder. Editing and running in one place, very comfortable~
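Before the full script, one note on the quoting fight described above: pymysql can do the quoting and escaping itself when values are passed as a second argument to cursor.execute, instead of being baked into the SQL with %. A minimal sketch under the same assumptions as the script below (placeholder credentials, an existing csdn database, and the table_1 schema the script creates):

# coding:utf-8
import pymysql

# placeholder credentials; the 'csdn' database must already exist
db = pymysql.connect("localhost", "root", "your_password", "csdn", charset='utf8')
cursor = db.cursor()

# The %s placeholders here are pymysql parameters, NOT Python string
# formatting: the driver quotes and escapes the values itself,
# including Chinese titles.
sql = 'replace into table_1 (blogs_id, title, page_view) values (%s, %s, %s)'
cursor.execute(sql, (12, u'我是题目', 123))
db.commit()
db.close()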
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2018-08-31
@author: lyl
Scrape the page-view counts of a personal CSDN blog with Python, persist them
to a database, and (eventually) visualize them; mainly for practice.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet
import time
import pymysql

# first list page
now_page = 2
# number of the last list page
last_page = 2
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempts, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\t\tpage views: %s\n-----------***----------"%(self.num,self.title,self.page_view)

class Blog_set(object):
    def __init__(self,id,now_page,last_page):
        self.id = id
        self.blog_list = self.get_blog(now_page,last_page)
        self.time = self.get_time()
        self.blog_sort()
        self.create_table()
    def insert_blogs(self):
        pass
    # Create the table. Every value is deliberately pulled out into a variable
    # so it is easy to change later.
    def create_table(self):
        # pymysql.connect() basics: the first argument is the host (locally
        # 'localhost' or 127.0.0.1), the second the user (usually 'root'),
        # the third that user's password (often still 'root' if you kept the
        # default; better to change it), the fourth the database you created,
        # and the fifth the charset, which must be 'utf8' without a dash.
        CHANGE the password below to your own! This line is deliberately not a comment: the script has to fail here so that you notice it!
        db = pymysql.connect("localhost", "root", "your_password", "csdn",charset='utf8')
        # get a cursor; standard pymysql boilerplate
        cursor = db.cursor()
        # The raw MySQL statement would be:
        #   create table if not exists table_name(id int primary key not null, title tinytext)
        # table_name is the table to create. Inside the parentheses, id is the
        # first column and int its type; primary key makes it the primary key
        # and not null forbids empty values; everything before the comma
        # describes the id column, and title is a second column.
        # I pulled all of this out into variables, and those formatting
        # details cost me a long time in bug-hunting!
        # tem holds the column definitions and must be one quoted string
        tem = '(blogs_id int primary key not null,title tinytext not null,page_view int not null)'
        print tem
        # table_id is the table name; it must also be a string, and it cannot
        # be purely numeric, not even as a number inside a string!
        table_id = 'table_'+str(self.id)
        # in pymysql the statement is built as a string (here: sql) and then
        # handed to execute(); every interpolated piece is itself a string
        sql = 'create table if not exists %s %s'%(table_id, tem)
        try:
            # try to execute the statement
            cursor.execute(sql)
            print 'Successfully created table (%s)!'%table_id
        except Exception,e:
            print e
            print 'Failed to create table (%s)'%table_id
        # Insert the rows into the table just created.
        # The overall flow of the program:
        # 1. The main block creates exactly one Blog_set, i.e. one crawl batch.
        # 2. The set then instantiates Blog for each post and collects its attributes.
        # 3. Blog_set attributes: id, time, blog count, blog_list, total_view.
        # 4. Blog attributes: rank in the batch, the article's own id (the one
        #    from its URL), title, page_view.
        # 5. With those in hand, the set stores every post of this batch in
        #    the table: rank, article id, title, and view count.
        # So the loop below saves the blog list one post at a time.
        for blog in self.blog_list:
            print blog.id
            print blog.page_view
            blog.id = int(blog.id)
            print blog.title
            # The title is a Chinese string, so quoting matters again.
            # With literal values the statement would look like:
            #   title = '\'中文\''
            #   sql = 'insert into %s (blogs, title, page_view) values (%d, %s, %d)'%(table_name,id,title,page_view)
            # or, fully spelled out:
            #   sql = 'insert into ta(blogs, title,page_view) values (12,\'我是题目\',123)'
            # I could not do much else with the blog.title variable, so I just
            # wrap it in a pair of escaped quotes, which actually works; I
            # never found this trick anywhere online.
            blog.title = '\''+blog.title+'\''
            print blog.title
            # make sure page_view is an int (it should already be one)
            blog.page_view = int(blog.page_view)
            # Build the statement: keep the columns in the same order as the
            # values, and do not confuse % with \. Using replace instead of
            # insert means: insert when the row is missing, overwrite when it
            # already exists, so no duplicate-key errors.
            sql = 'replace into %s (blogs_id,title,page_view) values (%d,%s,%d)'%(table_id,blog.id,blog.title,blog.page_view)
            try:
                print "inserting..."
                cursor.execute(sql)
                # commit the transaction
                db.commit()
                print 'successfully!'
            except Exception, e:
                # roll back on any error
                db.rollback()
                print e
                print 'failed!'
        # close the connection
        db.close()
    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_i' into a variable name and binds
            # a new Blog object to it; tem keeps the same reference for the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
    def get_time(self):
        # record when this batch was crawled, as a 'YYYY_M_D:H' string
        a = time.localtime()
        return str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])
    def blog_sort(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []
        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)
        # bubble-sort the three parallel lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]
        print len(self.view_list)
        print len(self.title_list)
        print "ok?"
        for v,t,num in zip(self.view_list,self.title_list,self.num_list):
            print "----------***----------\nTitle: 《%s》\nViews: %s\nPost #%d"%(t,v,num)
    def print_info(self):
        print "Number of blogs fetched this time: %d"%len(self.blog_list)
        print type(self.total_view)
        print "Total views: %d"%(int(self.total_view))
        print "This batch was fetched at: %s"%(self.time)

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req)
    # print myResponse.info()
    myPage = myResponse.read()
    return myPage

def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
        # the total view count sits in an attribute node:
        # <dd title="6047">
        #     6047 </dd>
        # pass a predicate to find_all, then read the text
        # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
        idList = re.findall('data-articleid=".*?"',myPage,re.S)
        for id in idList:
            # pull the numeric article id out of the attribute
            pattern = re.compile(r'\d+')
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))
            url_id = url_id[1:-1]
            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        now_page = now_page + 1
    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(id = 1,now_page=2,last_page=2)
    # blog_set.print_info()
    # print blog_set.blog_list[-1]
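Once a batch has been saved, the database can also hand the rows back already sorted, replacing the bubble sort for display purposes. A small sketch against the table_1 schema created above, again with placeholder credentials:

# coding:utf-8
import pymysql

# placeholder credentials; assumes table_1 was created by the script above
db = pymysql.connect("localhost", "root", "your_password", "csdn", charset='utf8')
cursor = db.cursor()
# let MySQL sort by view count, descending
cursor.execute('select blogs_id, title, page_view from table_1 order by page_view desc')
for blogs_id, title, page_view in cursor.fetchall():
    print('%s  %s  views=%s' % (blogs_id, title, page_view))
db.close()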
Finally, a screenshot of one run (the image itself is not reproduced in this draft).