您现在的位置是: 首页


程序员文章站 2022-07-14 11:17:56
# coding:utf-8
#!/usr/bin/env python
# Created on 2018年8月18日
# @author: XX创意车间
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet
# File-system encoding of the running interpreter.
# NOTE(review): this name shadows the builtin ``type``; it is not referenced
# anywhere else in the visible script.
type = sys.getfilesystemencoding()
# First and last article-list page numbers to scrape (inclusive).
now_page = 1
last_page = 2
# Accumulators filled while walking the list pages.
all_url = []
all_id = []

# CSDN account whose blog is scraped, and the base URL derived from it.
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    """One blog article: downloads its page and extracts title and view count.

    The constructor performs the download (via ``getPage``) and sets
    ``id``, ``num``, ``url``, ``page``, ``title`` and ``page_view``.
    """

    # Captures the text after "阅读数:" inside the read-count span.
    _READ_COUNT = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')

    def __init__(self,num,id):
        """Fetch article ``id``; ``num`` is its running number in the crawl."""
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()

    def get_page(self):
        """Build the article URL, remember it in ``self.url`` and fetch it."""
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)

    def get_title(self):
        """Return the text of the ``h1.title-article`` node as UTF-8 bytes.

        The title is encoded because ``print_info`` interpolates it into a
        byte-string template that contains non-ASCII characters; mixing a
        unicode title into it makes Python 2 attempt an ASCII decode and fail.
        Returns '' when the page has no such node.
        """
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        node = soup.find('h1',class_ = 'title-article')
        if node is None:
            return ''
        return node.get_text().encode('utf-8')

    def get_page_view(self):
        """Return the first read-count value found in the page, or ''.

        Takes the first regex match directly instead of slicing the ``str()``
        of the match list, which produced garbage for more than one match.
        """
        matches = self._READ_COUNT.findall(self.page)
        return matches[0].strip() if matches else ''

    def print_info(self):
        """Print the running number, title and view count of this article."""
        print("the %dth blog , title is —— \n《%s》 \n \t page_view is %s \n --------------***----------------"%(self.num,self.title,self.page_view))
def getPage(url, timeout=30):
    """Download ``url`` and return the raw response body.

    Args:
        url: address to fetch.
        timeout: seconds to wait on the socket before urllib2 raises.

    A browser-like User-Agent header is sent with the request. The response
    object is always closed, also when reading fails.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req, timeout=timeout)
    try:
        return myResponse.read()
    finally:
        myResponse.close()
# Captures the numeric article id of every entry on a list page.
article_id_pattern = re.compile(r'data-articleid="(\d+)"')

while now_page <= last_page:
    print('-----------------------------the %d page ---------------------------------' % (now_page,))
    # Fetch the HTML of the current article-list page.
    myUrl = baseUrl+'/article/list/'+str(now_page)
    myPage = getPage(myUrl)
    soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
    # The total view count is the text of the second tag carrying a
    # ``title`` attribute inside the "grade-box clearfix" div, e.g.
    # <dd title="6047"> 6047 </dd>.
    titled_tags = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
    total_view = titled_tags[1].get_text().replace(' ','').replace('\n','')
    print("total_view : %s"%total_view)
    # Collect the article ids; they drive the per-article fetch below.
    for raw_id in article_id_pattern.findall(myPage):
        all_id.append(str(int(raw_id)))
    # The former per-page title/view listing was dropped: it read
    # ``titleList`` and ``viewList``, which are never defined in this script.
    now_page = now_page + 1

# Create one Blog per collected id and expose it as module global blog_<n>.
for i, blog_id in enumerate(all_id, 1):
    globals()['blog_'+str(i)] = Blog(i,blog_id)



# coding:utf-8
#!/usr/bin/env python
# Created on 2016年2月13日
# @author: xingjiarong
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet
# File-system encoding of the running interpreter.
# NOTE(review): this name shadows the builtin ``type``; it is not referenced
# anywhere else in the visible script.
type = sys.getfilesystemencoding()
# (Removed commented-out experiments with stdout/stderr codecs and chardet.)

# Last article-list page number to scrape (inclusive).
last_page = 2
# Module-level accumulators; get_all_urls returns ``all_id``.
all_url = []
all_id = []
# CSDN account whose blog is scraped, and the base URL derived from it.
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    """One blog article: downloads its page and extracts title and view count.

    The constructor performs the download (via ``getPage``) and sets
    ``id``, ``num``, ``url``, ``page``, ``title`` and ``page_view``.
    """

    # Captures the text after "阅读数:" inside the read-count span.
    _READ_COUNT = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')

    def __init__(self,num,id):
        """Fetch article ``id``; ``num`` is its running number in the crawl."""
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()

    def get_page(self):
        """Build the article URL, remember it in ``self.url`` and fetch it."""
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)

    def get_title(self):
        """Return the ``h1.title-article`` text as UTF-8 bytes, '' if absent."""
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        node = soup.find('h1',class_ = 'title-article')
        if node is None:
            return ''
        return node.get_text().encode('utf-8')

    def get_page_view(self):
        """Return the first read-count value found in the page, or ''.

        Takes the first regex match directly instead of slicing the ``str()``
        of the match list, which produced garbage for more than one match.
        """
        matches = self._READ_COUNT.findall(self.page)
        return matches[0].strip() if matches else ''

    def print_info(self):
        """Print the running number, title and view count of this article."""
        print("the %dth blog , and the title is ——\n《%s》\n\t\tthe page_view is %s ! \n-----------***----------"%(self.num,self.title,self.page_view))

class Blog_set(object):
    """All articles found on the list pages up to ``last_page``.

    Sets ``all_id``, ``total_view`` and ``blog_list`` during construction.
    """

    def __init__(self,last_page):
        self.blog_list = self.get_blog(last_page)

    def get_blog(self,last_page):
        """Scrape the article ids and return one ``Blog`` per id, in order.

        Each Blog is numbered from 1. The objects are collected in the
        returned list; the earlier ``locals()`` assignment stored nothing and
        left the list empty.
        """
        self.all_id,self.total_view = get_all_urls(last_page)
        blog_list = [Blog(i,blog_id) for i, blog_id in enumerate(self.all_id, 1)]
        print("so this time the total page_view is %s"%(self.total_view))
        return blog_list

    def print_info(self):
        """Print the total view count reported by the list pages."""
        print("all blogs list is ")
        print(self.total_view)

def getPage(url, timeout=30):
    """Download ``url`` and return the raw response body.

    Args:
        url: address to fetch.
        timeout: seconds to wait on the socket before urllib2 raises.

    A browser-like User-Agent header is sent with the request. The response
    object is always closed, also when reading fails.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req, timeout=timeout)
    try:
        return myResponse.read()
    finally:
        myResponse.close()

def get_all_urls(last_page, now_page=2):
    """Collect article ids from list pages ``now_page``..``last_page``.

    Args:
        last_page: last list page to fetch (inclusive).
        now_page: first list page to fetch; defaults to 2, the value that
            used to be hard-coded in the body.

    Returns:
        ``(all_id, total_view)``. ``all_id`` holds the article ids as decimal
        strings in page order. ``total_view`` is the text of the second tag
        with a ``title`` attribute inside the "grade-box clearfix" div of the
        last page fetched, with spaces and newlines removed; it is None when
        the page range is empty and nothing was fetched.
    """
    all_id = []
    total_view = None
    # Captures the numeric article id of every entry on a list page.
    id_pattern = re.compile(r'data-articleid="(\d+)"')

    while now_page <= last_page:
        print('-----------------------------the %d page ---------------------------------' % (now_page,))
        # Fetch the HTML of the current article-list page.
        myPage = getPage(baseUrl+'/article/list/'+str(now_page))

        # Total view count, e.g. <dd title="6047"> 6047 </dd>.
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        titled_tags = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = titled_tags[1].get_text().replace(' ','').replace('\n','')

        for raw_id in id_pattern.findall(myPage):
            all_id.append(str(int(raw_id)))
        now_page = now_page + 1

    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(last_page)
    # blog_list is empty when no article id was scraped; indexing [-1]
    # would then raise IndexError.
    if blog_set.blog_list:
        print(blog_set.blog_list[-1].title)
    else:
        print("no blogs found")






# coding:utf-8
#!/usr/bin/env python
# Created on 2018年8月18日22时9分
# @author: XX创意车间
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet

# First and last article-list page numbers to scrape (inclusive).
now_page = 2
last_page = 2

# CSDN account whose blog is scraped, and the base URL derived from it.
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    """One blog article: downloads its page and extracts title and view count.

    The constructor performs the download (via ``getPage``) and sets
    ``id``, ``num``, ``url``, ``page``, ``title`` and ``page_view``.
    """

    # Captures the text after "阅读数:" inside the read-count span.
    _READ_COUNT = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')

    def __init__(self,num,id):
        """Fetch article ``id``; ``num`` is its running number in the crawl."""
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()

    def get_page(self):
        """Build the article URL, remember it in ``self.url`` and fetch it."""
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)

    def get_title(self):
        """Return the ``h1.title-article`` text as UTF-8 bytes, '' if absent."""
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        node = soup.find('h1',class_ = 'title-article')
        if node is None:
            return ''
        return node.get_text().encode('utf-8')

    def get_page_view(self):
        """Return the first read-count value found in the page, or ''.

        Takes the first regex match directly instead of slicing the ``str()``
        of the match list, which produced garbage for more than one match.
        """
        matches = self._READ_COUNT.findall(self.page)
        return matches[0].strip() if matches else ''

    def print_info(self):
        """Print the running number, title and view count of this article."""
        print("the %dth blog , and the title is ——\n《%s》\n\t\tthe page_view is %s ! \n-----------***----------"%(self.num,self.title,self.page_view))

class Blog_set(object):
    """All articles found on list pages ``now_page``..``last_page``.

    Sets ``all_id``, ``total_view`` and ``blog_list`` during construction.
    """

    def __init__(self,now_page,last_page):
        self.blog_list = self.get_blog(now_page,last_page)

    def get_blog(self,now_page,last_page):
        """Scrape the article ids and return one ``Blog`` per id, in order.

        Each Blog is numbered from 1. The objects are collected in the
        returned list; the earlier ``locals()`` assignment stored nothing and
        left the list empty.
        """
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        blog_list = [Blog(i,blog_id) for i, blog_id in enumerate(self.all_id, 1)]
        print("so this time the total page_view is %s"%(self.total_view))
        return blog_list

    @staticmethod
    def _view_count(blog):
        """Return the blog's view count as int; unparsable counts rank as 0."""
        try:
            return int(blog.page_view)
        except (TypeError, ValueError):
            return 0

    def object2dict(self):
        """Rank the blogs by view count (highest first) and print the result.

        Fills the parallel lists ``num_list``, ``title_list`` and
        ``view_list`` in ranked order, then prints each of them.
        """
        ranked = sorted(self.blog_list, key=self._view_count, reverse=True)
        self.num_list = [blog.num for blog in ranked]
        self.title_list = [blog.title for blog in ranked]
        self.view_list = [blog.page_view for blog in ranked]

        print(self.num_list)
        print(self.title_list)
        print(self.view_list)

    def print_info(self):
        """Print the number of blogs and the total view count."""
        print("the number of blogs is %d"%len(self.blog_list))
        print(self.total_view)

def getPage(url, timeout=30):
    """Download ``url`` and return the raw response body.

    Args:
        url: address to fetch.
        timeout: seconds to wait on the socket before urllib2 raises.

    A browser-like User-Agent header is sent with the request. The response
    object is always closed, also when reading fails.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req, timeout=timeout)
    try:
        return myResponse.read()
    finally:
        myResponse.close()

def get_all_urls(now_page,last_page):
    """Collect article ids from list pages ``now_page``..``last_page``.

    Returns:
        ``(all_id, total_view)``. ``all_id`` holds the article ids as decimal
        strings in page order. ``total_view`` is the text of the second tag
        with a ``title`` attribute inside the "grade-box clearfix" div of the
        last page fetched, with spaces and newlines removed; it is None when
        the page range is empty and nothing was fetched.
    """
    all_id = []
    total_view = None
    # Captures the numeric article id of every entry on a list page.
    id_pattern = re.compile(r'data-articleid="(\d+)"')

    while now_page <= last_page:
        print('-----------------------------the %d page ---------------------------------' % (now_page,))
        # Fetch the HTML of the current article-list page.
        myPage = getPage(baseUrl+'/article/list/'+str(now_page))

        # Total view count, e.g. <dd title="6047"> 6047 </dd>.
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        titled_tags = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = titled_tags[1].get_text().replace(' ','').replace('\n','')

        for raw_id in id_pattern.findall(myPage):
            all_id.append(str(int(raw_id)))
        now_page = now_page + 1

    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(now_page,last_page)
    # blog_list is empty when no article id was scraped; indexing [-1]
    # would then raise IndexError.
    if blog_set.blog_list:
        print(blog_set.blog_list[-1])
    else:
        print("no blogs found")


import time
# Build a "year_month_day:hour" stamp from the local time (no zero padding).
a = time.localtime()
c = "_".join(str(part) for part in a[:3]) + ":" + str(a[3])
print(c)



# coding:utf-8
#!/usr/bin/env python
# Created on 2016年2月13日
# @author: xingjiarong
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet
import time

# First and last article-list page numbers to scrape (inclusive).
now_page = 2
last_page = 2

# CSDN account whose blog is scraped, and the base URL derived from it.
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    """One blog article: downloads its page and extracts title and view count.

    The constructor performs the download (via ``getPage``) and sets
    ``id``, ``num``, ``url``, ``page``, ``title`` and ``page_view``.
    """

    # Captures the text after "阅读数:" inside the read-count span.
    _READ_COUNT = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')

    def __init__(self,num,id):
        """Fetch article ``id``; ``num`` is its running number in the crawl."""
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()

    def get_page(self):
        """Build the article URL, remember it in ``self.url`` and fetch it."""
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)

    def get_title(self):
        """Return the ``h1.title-article`` text as UTF-8 bytes, '' if absent."""
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        node = soup.find('h1',class_ = 'title-article')
        if node is None:
            return ''
        return node.get_text().encode('utf-8')

    def get_page_view(self):
        """Return the first read-count value found in the page, or ''.

        Takes the first regex match directly instead of slicing the ``str()``
        of the match list, which produced garbage for more than one match.
        """
        matches = self._READ_COUNT.findall(self.page)
        return matches[0].strip() if matches else ''

    def print_info(self):
        """Print the running number, title and view count of this article."""
        print("the %dth blog , and the title is ——\n《%s》\n\t\tthe page_view is %s ! \n-----------***----------"%(self.num,self.title,self.page_view))

class Blog_set(object):
    """All articles found on list pages ``now_page``..``last_page``.

    Sets ``all_id``, ``total_view``, ``blog_list`` and ``time`` (the
    acquisition timestamp) during construction.
    """

    def __init__(self,now_page,last_page):
        self.blog_list = self.get_blog(now_page,last_page)
        self.time = self.get_time()

    def get_blog(self,now_page,last_page):
        """Scrape the article ids and return one ``Blog`` per id, in order.

        Each Blog is numbered from 1. The objects are collected in the
        returned list; the earlier ``locals()`` assignment stored nothing and
        left the list empty.
        """
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        blog_list = [Blog(i,blog_id) for i, blog_id in enumerate(self.all_id, 1)]
        print("so this time the total page_view is %s"%(self.total_view))
        return blog_list

    def get_time(self):
        """Return the local time as "year_month_day:hour" (no zero padding).

        The value is only returned; it no longer overwrites this method on
        the instance, so ``get_time`` stays callable.
        """
        a = time.localtime()
        return str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])

    @staticmethod
    def _view_count(blog):
        """Return the blog's view count as int; unparsable counts rank as 0."""
        try:
            return int(blog.page_view)
        except (TypeError, ValueError):
            return 0

    def blog_sort(self):
        """Rank the blogs by view count (highest first) and print the ranking.

        Fills the parallel lists ``num_list``, ``title_list`` and
        ``view_list`` in ranked order.
        """
        ranked = sorted(self.blog_list, key=self._view_count, reverse=True)
        self.num_list = [blog.num for blog in ranked]
        self.title_list = [blog.title for blog in ranked]
        self.view_list = [blog.page_view for blog in ranked]

        print(len(self.view_list))
        print(len(self.title_list))

        for v,t in zip(self.view_list,self.title_list):
            print("----------***----------\n题目: 《%s》\n浏览量: %s"%(t,v))

    def print_info(self):
        """Print blog count, total view count and the acquisition time."""
        print("the number of blogs is %d"%len(self.blog_list))
        print(self.total_view)
        print("这批数据的获取时间: %s"%(self.time))

def getPage(url, timeout=30):
    """Download ``url`` and return the raw response body.

    Args:
        url: address to fetch.
        timeout: seconds to wait on the socket before urllib2 raises.

    A browser-like User-Agent header is sent with the request. The response
    object is always closed, also when reading fails.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req, timeout=timeout)
    try:
        return myResponse.read()
    finally:
        myResponse.close()

def get_all_urls(now_page,last_page):
    """Collect article ids from list pages ``now_page``..``last_page``.

    Returns:
        ``(all_id, total_view)``. ``all_id`` holds the article ids as decimal
        strings in page order. ``total_view`` is the text of the second tag
        with a ``title`` attribute inside the "grade-box clearfix" div of the
        last page fetched, with spaces and newlines removed; it is None when
        the page range is empty and nothing was fetched.
    """
    all_id = []
    total_view = None
    # Captures the numeric article id of every entry on a list page.
    id_pattern = re.compile(r'data-articleid="(\d+)"')

    while now_page <= last_page:
        print('-----------------------------the %d page ---------------------------------' % (now_page,))
        # Fetch the HTML of the current article-list page.
        myPage = getPage(baseUrl+'/article/list/'+str(now_page))

        # Total view count, e.g. <dd title="6047"> 6047 </dd>.
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        titled_tags = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = titled_tags[1].get_text().replace(' ','').replace('\n','')

        for raw_id in id_pattern.findall(myPage):
            all_id.append(str(int(raw_id)))
        now_page = now_page + 1

    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(now_page,last_page)
    # blog_list is empty when no article id was scraped; indexing [-1]
    # would then raise IndexError.
    if blog_set.blog_list:
        print(blog_set.blog_list[-1])
    else:
        print("no blogs found")





注意:代码中的数据库密码已被替换为占位符“密码”,运行前请改成自己的数据库密码。




# coding:utf-8
#!/usr/bin/env python
# Created on 2018年8月31日
# @author: lyl
import urllib2
import re
from bs4 import BeautifulSoup 
import sys
import chardet
import time

import pymysql

# First and last article-list page numbers to scrape (inclusive).
now_page = 2
last_page = 2
# CSDN account whose blog is scraped, and the base URL derived from it.
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account
class Blog(object):
    """One blog article: downloads its page and extracts title and view count.

    The constructor performs the download (via ``getPage``) and sets
    ``id``, ``num``, ``url``, ``page``, ``title`` and ``page_view``.
    """

    # Captures the text after "阅读数:" inside the read-count span.
    _READ_COUNT = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')

    def __init__(self,num,id):
        """Fetch article ``id``; ``num`` is its running number in the crawl."""
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()

    def get_page(self):
        """Build the article URL, remember it in ``self.url`` and fetch it."""
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)

    def get_title(self):
        """Return the ``h1.title-article`` text as UTF-8 bytes, '' if absent."""
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        node = soup.find('h1',class_ = 'title-article')
        if node is None:
            return ''
        return node.get_text().encode('utf-8')

    def get_page_view(self):
        """Return the first read-count value found in the page, or ''.

        Takes the first regex match directly instead of slicing the ``str()``
        of the match list, which produced garbage for more than one match.
        """
        matches = self._READ_COUNT.findall(self.page)
        return matches[0].strip() if matches else ''

    def print_info(self):
        """Print the running number, title and view count of this article."""
        print("the %dth blog , and the title is ——\n《%s》\n\t\tthe page_view is %s ! \n-----------***----------"%(self.num,self.title,self.page_view))
class Blog_set(object):
    """Scraped articles of list pages ``now_page``..``last_page`` plus MySQL export.

    ``id`` selects the target table (``table_<id>``). Construction sets
    ``id``, ``all_id``, ``total_view``, ``blog_list`` and ``time``.
    """

    def __init__(self,id,now_page,last_page):
        self.id = id
        self.blog_list = self.get_blog(now_page,last_page)
        self.time = self.get_time()

    def insert_blogs(self):
        """Store all blogs in the database.

        Delegates to ``create_table``, which creates the table when missing
        and then writes one row per blog.
        NOTE(review): the original body of this method is not visible in the
        source; confirm that delegating matches the intended behavior.
        """
        self.create_table()

    def create_table(self):
        """Create ``table_<id>`` if needed and write one row per blog.

        Failures while creating the table or writing a row are printed and
        do not abort the run. The connection is always closed.
        """
        # "密码" is a placeholder; replace it with the real database password.
        db = pymysql.connect("localhost", "root", "密码", "csdn",charset='utf8')
        try:
            cursor = db.cursor()
            # Column layout: blogs_id is the primary key; no column may be NULL.
            tem = '(blogs_id int primary key not null,title tinytext not null,page_view int not null)'
            print(tem)
            table_id = 'table_'+str(self.id)
            sql = 'create table if not exists %s %s'%(table_id, tem)
            try:
                cursor.execute(sql)
                print('Sucessfully create table(%s)!'%table_id)
            except Exception as e:
                print(e)
                print('Fail to create table(%s)'%table_id)
            self._insert_rows(db, cursor, table_id)
        finally:
            db.close()

    def _insert_rows(self, db, cursor, table_id):
        """Write every blog into ``table_id``, replacing rows with the same id."""
        # Only the table name is interpolated; the values are passed as query
        # parameters so the driver escapes them (titles may contain quotes).
        sql = 'replace into %s (blogs_id,title,page_view) values (%%s,%%s,%%s)'%table_id
        for blog in self.blog_list:
            print(blog.id)
            print(blog.page_view)
            print(blog.title)
            try:
                blog.id = int(blog.id)
                blog.page_view = int(blog.page_view)
                print("努力插入中...")
                cursor.execute(sql, (blog.id, blog.title, blog.page_view))
                db.commit()
                print('sucessfully!')
            except Exception as e:
                # Undo the failed statement and continue with the next blog.
                db.rollback()
                print(e)
                print('failed!')

    def get_blog(self,now_page,last_page):
        """Scrape the article ids and return one ``Blog`` per id, in order.

        Each Blog is numbered from 1. The objects are collected in the
        returned list; the earlier ``locals()`` assignment stored nothing and
        left the list empty.
        """
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        blog_list = [Blog(i,blog_id) for i, blog_id in enumerate(self.all_id, 1)]
        print("so this time the total page_view is %s"%(self.total_view))
        return blog_list

    def get_time(self):
        """Return the local time as "year_month_day:hour" (no zero padding).

        The value is only returned; it no longer overwrites this method on
        the instance, so ``get_time`` stays callable.
        """
        a = time.localtime()
        return str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])

    @staticmethod
    def _view_count(blog):
        """Return the blog's view count as int; unparsable counts rank as 0."""
        try:
            return int(blog.page_view)
        except (TypeError, ValueError):
            return 0

    def blog_sort(self):
        """Rank the blogs by view count (highest first) and print the ranking.

        Fills the parallel lists ``num_list``, ``title_list`` and
        ``view_list`` in ranked order.
        """
        ranked = sorted(self.blog_list, key=self._view_count, reverse=True)
        self.num_list = [blog.num for blog in ranked]
        self.title_list = [blog.title for blog in ranked]
        self.view_list = [blog.page_view for blog in ranked]
        print(len(self.view_list))
        print(len(self.title_list))
        print("ok?")
        for v,t,num in zip(self.view_list,self.title_list,self.num_list):
            print("----------***----------\n题目: 《%s》\n浏览量: %s \n第%d篇文章"%(t,v,num))

    def print_info(self):
        """Print blog count, total view count and the acquisition time."""
        print("本次获取博客数量为:%d"%len(self.blog_list))
        print(type(self.total_view))
        print("总阅读量为:%d"%(int(self.total_view)))
        print("这批数据的获取时间: %s"%(self.time))
def getPage(url, timeout=30):
    """Download ``url`` and return the raw response body.

    Args:
        url: address to fetch.
        timeout: seconds to wait on the socket before urllib2 raises.

    A browser-like User-Agent header is sent with the request. The response
    object is always closed, also when reading fails.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req, timeout=timeout)
    try:
        return myResponse.read()
    finally:
        myResponse.close()
def get_all_urls(now_page,last_page):
    """Collect article ids from list pages ``now_page``..``last_page``.

    Returns:
        ``(all_id, total_view)``. ``all_id`` holds the article ids as decimal
        strings in page order. ``total_view`` is the text of the second tag
        with a ``title`` attribute inside the "grade-box clearfix" div of the
        last page fetched, with spaces and newlines removed; it is None when
        the page range is empty and nothing was fetched.
    """
    all_id = []
    total_view = None
    # Captures the numeric article id of every entry on a list page.
    id_pattern = re.compile(r'data-articleid="(\d+)"')
    while now_page <= last_page:
        print('-----------------------------the %d page ---------------------------------' % (now_page,))
        # Fetch the HTML of the current article-list page.
        myPage = getPage(baseUrl+'/article/list/'+str(now_page))
        # Total view count, e.g. <dd title="6047"> 6047 </dd>.
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        titled_tags = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = titled_tags[1].get_text().replace(' ','').replace('\n','')
        for raw_id in id_pattern.findall(myPage):
            all_id.append(str(int(raw_id)))
        now_page = now_page + 1
    return all_id,total_view
if __name__ == '__main__':
    # Scrape list page 2 only; the first argument (1) is the set id that
    # create_table turns into the table name.
    blog_set = Blog_set(1, now_page=2, last_page=2)

