Crawler demo (draft)
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2018-08-18
@author: XX创意车间
Scrape the page-view counts of a personal CSDN blog with Python, for later visualization.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet

reload(sys)
sys.setdefaultencoding('utf-8')
type = sys.getfilesystemencoding()

# current page number of the article list
now_page = 1
# number of the last list page
last_page = 2
all_url = []
all_id = []
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        # print(self.url)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempt, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2]
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text()
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        # title = re.findall('<span class="read-count">(.*?)</span>',self.page,re.S)
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\tpage views: %s\n--------------***----------------"%(self.num,self.title,self.page_view)

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    # build the request
    req = urllib2.Request(url,headers=headers)
    # fetch the page
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    return myPage

while now_page <= last_page:
    print '-----------------------------the %d page ---------------------------------' % (now_page,)
    # fetch the source of this list page
    myUrl = baseUrl+'/article/list/'+str(now_page)
    myPage = getPage(myUrl)
    soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
    # the total view count sits in an attribute node:
    # <dd title="6047">
    #     6047 </dd>
    # pass a predicate to find_all, then read the text
    # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
    total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
    total_view = total_view[1].get_text().replace(' ','').replace('\n','')
    print "total_view : %s"%total_view
    idList = re.findall('data-articleid=".*?"',myPage,re.S)
    for id in idList:
        # pull the numeric article id out of the attribute
        pattern = re.compile(r'\d+')
        url_id = pattern.findall(id)
        url_id = str(map(int, url_id))
        url_id = url_id[1:-1]
        all_id.append(url_id)
        all_url.append(baseUrl+'/'+url_id)
    title = re.findall('<span class="link_title"><a href=".*?">(.*?)</a></span>',myPage,re.S)
    titleList=[]
    for items in title:
        titleList.append(str(items).lstrip().rstrip())
    # the per-post view counts used to be pulled here with a regex like the one
    # below, but that markup changed, so viewList is no longer built and only
    # the titles are printed (per-post views come from the Blog class instead):
    # view = re.findall('<span class="link_view".*?><a href=".*?" title="阅读次数">阅读</a>\((.*?)\)</span>',myPage,re.S)
    for n in range(len(titleList)):
        print('title:%s' % (titleList[n],))
    # advance to the next page
    now_page = now_page + 1

i = 1
for id in all_id:
    locals()['blog_'+str(i)] = Blog(i,id)
    i += 1
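A side note on that last loop: locals()['blog_'+str(i)] conjures the names blog_1, blog_2, ... at run time, but nothing ever reads those names back, so a plain list does the same job. A minimal sketch of the simplification, assuming the Blog class and all_id list defined above:

# Hypothetical simplification of the loop above: skip the dynamic
# variable names and keep the Blog objects in an ordinary list.
blogs = []
for i, id in enumerate(all_id, start=1):
    blogs.append(Blog(i, id))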
The latest version, edited on the lab's Ubuntu 16 machine; it adds a class for the whole blog dataset:
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2016-02-13
@author: xingjiarong
Scrape the page-view counts of a personal CSDN blog with Python; mainly for practice.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet

reload(sys)
sys.setdefaultencoding('utf-8')
type = sys.getfilesystemencoding()
# if sys.stdout.encoding != 'UTF-8':
#     sys.stdout = codecs.getwriter('utf-8')(sys.stdout, 'strict')
# if sys.stderr.encoding != 'UTF-8':
#     sys.stderr = codecs.getwriter('utf-8')(sys.stderr, 'strict')

# number of the last list page
last_page = 2
all_url = []
all_id = []
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempts, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\t\tpage views: %s\n-----------***----------"%(self.num,self.title,self.page_view)

class Blog_set(object):
    def __init__(self,last_page):
        self.blog_list = self.get_blog(last_page)
        self.print_info()
    def get_blog(self,last_page):
        self.all_id,self.total_view = get_all_urls(last_page)
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_i' into a variable name and binds
            # a new Blog object to it; tem keeps the same reference for the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
    def print_info(self):
        print "total view count for this batch:"
        print self.total_view

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    return myPage

def get_all_urls(last_page):
    # this draft starts from list page 2
    now_page = 2
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
        # the total view count sits in an attribute node:
        # <dd title="6047">
        #     6047 </dd>
        # pass a predicate to find_all, then read the text
        # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
        idList = re.findall('data-articleid=".*?"',myPage,re.S)
        for id in idList:
            # pull the numeric article id out of the attribute
            pattern = re.compile(r'\d+')
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))
            url_id = url_id[1:-1]
            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        now_page = now_page + 1
    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1].title
Updated 2018-08-18 22:09, on the lab machine.
New: a dataset class for each crawl batch, which sorts the posts by view count.
Not yet implemented: recording the Beijing time of each crawl, saving to MySQL, and visualization.
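A note on that sorting: the version below implements it in object2dict as a hand-rolled bubble sort over three parallel lists (num_list, title_list, view_list). For reference, a minimal sketch of the same descending sort-by-views using the built-in sorted; the three sample lists here are stand-ins:

# Sort three parallel lists by view count, descending; equivalent in
# effect to the bubble sort in object2dict below.
num_list = [1, 2, 3]
title_list = ['a', 'b', 'c']
view_list = ['120', '45', '300']

# zip the rows together, sort once by the integer view count
rows = sorted(zip(num_list, title_list, view_list),
              key=lambda row: int(row[2]), reverse=True)
num_list, title_list, view_list = [list(t) for t in zip(*rows)]
print(view_list)  # ['300', '120', '45']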
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2018-08-18 22:09
@author: XX创意车间
Scrape the page-view counts of a personal CSDN blog with Python, for later visualization.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet

# first list page
now_page = 2
# number of the last list page
last_page = 2
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempts, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\t\tpage views: %s\n-----------***----------"%(self.num,self.title,self.page_view)

class Blog_set(object):
    def __init__(self,now_page,last_page):
        self.blog_list = self.get_blog(now_page,last_page)
        self.print_info()
        self.object2dict()
    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_i' into a variable name and binds
            # a new Blog object to it; tem keeps the same reference for the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
    def object2dict(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []
        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)
        # bubble-sort the three parallel lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]
        print self.num_list
        print self.title_list
        print self.view_list
    def print_info(self):
        print "the number of blogs is %d"%len(self.blog_list)
        print self.total_view

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    return myPage

def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
        # the total view count sits in an attribute node:
        # <dd title="6047">
        #     6047 </dd>
        # pass a predicate to find_all, then read the text
        # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
        idList = re.findall('data-articleid=".*?"',myPage,re.S)
        for id in idList:
            # pull the numeric article id out of the attribute
            pattern = re.compile(r'\d+')
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))
            url_id = url_id[1:-1]
            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        now_page = now_page + 1
    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(now_page,last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1]
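One caveat about that last line: blog_set.blog_list[-1] is a Blog object, so print shows only the default repr, something like <__main__.Blog object at 0x...>. If readable output is wanted there, a __repr__ could be added to Blog; the method below is a hypothetical addition, not part of the original script:

class Blog(object):
    # ... existing __init__ and scraping methods ...
    def __repr__(self):
        # readable representation, so printing a Blog (or the last element of
        # blog_list) shows the title and view count instead of the default repr
        return 'Blog(#%d, 《%s》, views=%s)' % (self.num, self.title, self.page_view)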
Added a small snippet that builds a year_month_day:hour timestamp string:
import time
a = time.localtime()
c = str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])
print c
Output: '2018_8_19:14'
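The same string can be produced in one call with time.strftime; a minimal equivalent (note that strftime zero-pads, so it prints '2018_08_19:14' rather than '2018_8_19:14'):

import time

# one-call equivalent of the index-based version above
c = time.strftime('%Y_%m_%d:%H', time.localtime())
print(c)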
So here is the latest integrated code:
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2016-02-13
@author: xingjiarong
Scrape the page-view counts of a personal CSDN blog with Python; mainly for practice.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet
import time

# first list page
now_page = 2
# number of the last list page
last_page = 2
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempts, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\t\tpage views: %s\n-----------***----------"%(self.num,self.title,self.page_view)

class Blog_set(object):
    def __init__(self,now_page,last_page):
        self.blog_list = self.get_blog(now_page,last_page)
        self.time = self.get_time()
        self.print_info()
        self.blog_sort()
    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_i' into a variable name and binds
            # a new Blog object to it; tem keeps the same reference for the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
    def get_time(self):
        # record when this batch was crawled, as a 'YYYY_M_D:H' string
        a = time.localtime()
        return str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])
    def blog_sort(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []
        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)
        # bubble-sort the three parallel lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]
        print len(self.view_list)
        print len(self.title_list)
        for v,t in zip(self.view_list,self.title_list):
            print "----------***----------\nTitle: 《%s》\nViews: %s"%(t,v)
    def print_info(self):
        print "the number of blogs is %d"%len(self.blog_list)
        print self.total_view
        print "This batch was fetched at: %s"%(self.time)

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    return myPage

def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
        # the total view count sits in an attribute node:
        # <dd title="6047">
        #     6047 </dd>
        # pass a predicate to find_all, then read the text
        # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
        idList = re.findall('data-articleid=".*?"',myPage,re.S)
        for id in idList:
            # pull the numeric article id out of the attribute
            pattern = re.compile(r'\d+')
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))
            url_id = url_id[1:-1]
            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        now_page = now_page + 1
    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(now_page,last_page)
    # blog_set.print_info()
    print blog_set.blog_list[-1]
Update, 2018-08-31 21:41:
Added database support. Install pymysql and Navicat, and manually create a database named csdn; after that the script creates the table and saves the data through library calls. Very satisfying~
The formatting, though, took me a day or two to sort out: quoting Chinese data kept failing, and since every crawl takes a while, iterating was painfully slow. In the end I wrote a separate little demo program just to find a pattern that works.
I replaced my real database password with the placeholder 'your_password'; change it to your own~
PS: the earlier problem of cmd refusing to print Chinese went away once I switched to running the program in Spyder. Editing and running in one place, very comfortable~
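Before the full script, one note on the quoting fight described above: pymysql can do the quoting and escaping itself when values are passed as a second argument to cursor.execute, instead of being baked into the SQL with %. A minimal sketch under the same assumptions as the script below (placeholder credentials, an existing csdn database, and the table_1 schema the script creates):

# coding:utf-8
import pymysql

# placeholder credentials; the 'csdn' database must already exist
db = pymysql.connect("localhost", "root", "your_password", "csdn", charset='utf8')
cursor = db.cursor()

# The %s placeholders here are pymysql parameters, NOT Python string
# formatting: the driver quotes and escapes the values itself,
# including Chinese titles.
sql = 'replace into table_1 (blogs_id, title, page_view) values (%s, %s, %s)'
cursor.execute(sql, (12, u'我是题目', 123))
db.commit()
db.close()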
#!/usr/bin/env python
# coding:utf-8
'''
Created on 2018-08-31
@author: lyl
Scrape the page-view counts of a personal CSDN blog with Python, persist them
to a database, and (eventually) visualize them; mainly for practice.
'''
import urllib2
import re
from bs4 import BeautifulSoup
import sys
import chardet
import time
import pymysql

# first list page
now_page = 2
# number of the last list page
last_page = 2
account = 'hehedadaq'
baseUrl = 'http://blog.csdn.net/'+account

class Blog(object):
    def __init__(self,num,id):
        self.id = id
        self.num = num
        self.page = self.get_page()
        self.title = self.get_title()
        self.page_view = self.get_page_view()
        self.print_info()
    def get_page(self):
        self.url = baseUrl+'/article/details/'+str(self.id)
        return getPage(self.url)
    def get_title(self):
        # earlier regex attempts, kept for reference:
        # pattern = re.compile(r'<h1.*?class="title-article">(.*?)</h1>')
        # title = str(pattern.findall(self.page))[2:-2].encode('ascii').decode('unicode_escape')
        # title = title.encode('unicode_escape').decode('string_escape')
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        title = soup.find('h1',class_ = 'title-article')
        title = title.get_text().encode('utf-8')
        return title
    def get_page_view(self):
        soup = BeautifulSoup(self.page,'html.parser',from_encoding='utf-8')
        # the Chinese label "阅读数:" on the page means "views:"; capture the number after it
        pattern = re.compile(r'<span.*?class="read-count">阅读数:(.*?)</span>')
        page_view = pattern.findall(self.page)
        page_view = str(page_view)[2:-2]
        return page_view
    def print_info(self):
        print "blog #%d, title:\n《%s》\n\t\tpage views: %s\n-----------***----------"%(self.num,self.title,self.page_view)

class Blog_set(object):
    def __init__(self,id,now_page,last_page):
        self.id = id
        self.blog_list = self.get_blog(now_page,last_page)
        self.time = self.get_time()
        self.blog_sort()
        self.create_table()
    def insert_blogs(self):
        pass
    # Create the table. Every value is deliberately pulled out into a variable
    # so it is easy to change later.
    def create_table(self):
        # pymysql.connect() basics: the first argument is the host (locally
        # 'localhost' or 127.0.0.1), the second the user (usually 'root'),
        # the third that user's password (often still 'root' if you kept the
        # default; better to change it), the fourth the database you created,
        # and the fifth the charset, which must be 'utf8' without a dash.
        CHANGE the password below to your own! This line is deliberately not a comment: the script has to fail here so that you notice it!
        db = pymysql.connect("localhost", "root", "your_password", "csdn",charset='utf8')
        # get a cursor; standard pymysql boilerplate
        cursor = db.cursor()
        # The raw MySQL statement would be:
        #   create table if not exists table_name(id int primary key not null, title tinytext)
        # table_name is the table to create. Inside the parentheses, id is the
        # first column and int its type; primary key makes it the primary key
        # and not null forbids empty values; everything before the comma
        # describes the id column, and title is a second column.
        # I pulled all of this out into variables, and those formatting
        # details cost me a long time in bug-hunting!
        # tem holds the column definitions and must be one quoted string
        tem = '(blogs_id int primary key not null,title tinytext not null,page_view int not null)'
        print tem
        # table_id is the table name; it must also be a string, and it cannot
        # be purely numeric, not even as a number inside a string!
        table_id = 'table_'+str(self.id)
        # in pymysql the statement is built as a string (here: sql) and then
        # handed to execute(); every interpolated piece is itself a string
        sql = 'create table if not exists %s %s'%(table_id, tem)
        try:
            # try to execute the statement
            cursor.execute(sql)
            print 'Successfully created table (%s)!'%table_id
        except Exception,e:
            print e
            print 'Failed to create table (%s)'%table_id
        # Insert the rows into the table just created.
        # The overall flow of the program:
        # 1. The main block creates exactly one Blog_set, i.e. one crawl batch.
        # 2. The set then instantiates Blog for each post and collects its attributes.
        # 3. Blog_set attributes: id, time, blog count, blog_list, total_view.
        # 4. Blog attributes: rank in the batch, the article's own id (the one
        #    from its URL), title, page_view.
        # 5. With those in hand, the set stores every post of this batch in
        #    the table: rank, article id, title, and view count.
        # So the loop below saves the blog list one post at a time.
        for blog in self.blog_list:
            print blog.id
            print blog.page_view
            blog.id = int(blog.id)
            print blog.title
            # The title is a Chinese string, so quoting matters again.
            # With literal values the statement would look like:
            #   title = '\'中文\''
            #   sql = 'insert into %s (blogs, title, page_view) values (%d, %s, %d)'%(table_name,id,title,page_view)
            # or, fully spelled out:
            #   sql = 'insert into ta(blogs, title,page_view) values (12,\'我是题目\',123)'
            # I could not do much else with the blog.title variable, so I just
            # wrap it in a pair of escaped quotes, which actually works; I
            # never found this trick anywhere online.
            blog.title = '\''+blog.title+'\''
            print blog.title
            # make sure page_view is an int (it should already be one)
            blog.page_view = int(blog.page_view)
            # Build the statement: keep the columns in the same order as the
            # values, and do not confuse % with \. Using replace instead of
            # insert means: insert when the row is missing, overwrite when it
            # already exists, so no duplicate-key errors.
            sql = 'replace into %s (blogs_id,title,page_view) values (%d,%s,%d)'%(table_id,blog.id,blog.title,blog.page_view)
            try:
                print "inserting..."
                cursor.execute(sql)
                # commit the transaction
                db.commit()
                print 'successfully!'
            except Exception, e:
                # roll back on any error
                db.rollback()
                print e
                print 'failed!'
        # close the connection
        db.close()
    def get_blog(self,now_page,last_page):
        self.all_id,self.total_view = get_all_urls(now_page,last_page)
        i = 1
        blog_list = []
        for id in self.all_id:
            # locals() turns the string 'blog_i' into a variable name and binds
            # a new Blog object to it; tem keeps the same reference for the list
            tem = locals()['blog_'+str(i)] = Blog(i,id)
            blog_list.append(tem)
            i += 1
        print "so this time the total page_view is %s"%(self.total_view)
        return blog_list
    def get_time(self):
        # record when this batch was crawled, as a 'YYYY_M_D:H' string
        a = time.localtime()
        return str(a[0])+"_"+str(a[1])+'_'+str(a[2])+":"+str(a[3])
    def blog_sort(self):
        self.num_list = []
        self.title_list = []
        self.view_list = []
        for blog_address in self.blog_list:
            self.num_list.append(blog_address.num)
            self.title_list.append(blog_address.title)
            self.view_list.append(blog_address.page_view)
        # bubble-sort the three parallel lists by view count, descending
        for i in range(len(self.num_list)-1):
            for j in range(i+1,len(self.num_list)):
                if int(self.view_list[i]) < int(self.view_list[j]):
                    self.num_list[i],self.num_list[j] = self.num_list[j],self.num_list[i]
                    self.view_list[i],self.view_list[j] = self.view_list[j],self.view_list[i]
                    self.title_list[i],self.title_list[j] = self.title_list[j],self.title_list[i]
        print len(self.view_list)
        print len(self.title_list)
        print "ok?"
        for v,t,num in zip(self.view_list,self.title_list,self.num_list):
            print "----------***----------\nTitle: 《%s》\nViews: %s\nPost #%d"%(t,v,num)
    def print_info(self):
        print "Number of blogs fetched this time: %d"%len(self.blog_list)
        print type(self.total_view)
        print "Total views: %d"%(int(self.total_view))
        print "This batch was fetched at: %s"%(self.time)

def getPage(url):
    # pretend to be a browser; CSDN rejects requests without a User-Agent
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    req = urllib2.Request(url,headers=headers)
    myResponse = urllib2.urlopen(req)
    # print myResponse.info()
    myPage = myResponse.read()
    return myPage

def get_all_urls(now_page,last_page):
    all_url = []
    all_id = []
    while now_page <= last_page:
        print '-----------------------------the %d page ---------------------------------' % (now_page,)
        # fetch the source of this list page
        myUrl = baseUrl+'/article/list/'+str(now_page)
        myPage = getPage(myUrl)
        # the total view count sits in an attribute node:
        # <dd title="6047">
        #     6047 </dd>
        # pass a predicate to find_all, then read the text
        # (see https://www.cnblogs.com/cymwill/articles/7574479.html)
        soup = BeautifulSoup(myPage,'html.parser',from_encoding='utf-8')
        total_view = soup.find('div',class_ = 'grade-box clearfix').find_all(lambda tag:tag.has_attr('title'))
        total_view = total_view[1].get_text().replace(' ','').replace('\n','')
        idList = re.findall('data-articleid=".*?"',myPage,re.S)
        for id in idList:
            # pull the numeric article id out of the attribute
            pattern = re.compile(r'\d+')
            url_id = pattern.findall(id)
            url_id = str(map(int, url_id))
            url_id = url_id[1:-1]
            all_id.append(url_id)
            all_url.append(baseUrl+'/'+url_id)
        now_page = now_page + 1
    return all_id,total_view

if __name__ == '__main__':
    blog_set = Blog_set(id = 1,now_page=2,last_page=2)
    # blog_set.print_info()
    # print blog_set.blog_list[-1]
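Once a batch has been saved, the database can also hand the rows back already sorted, replacing the bubble sort for display purposes. A small sketch against the table_1 schema created above, again with placeholder credentials:

# coding:utf-8
import pymysql

# placeholder credentials; assumes table_1 was created by the script above
db = pymysql.connect("localhost", "root", "your_password", "csdn", charset='utf8')
cursor = db.cursor()
# let MySQL sort by view count, descending
cursor.execute('select blogs_id, title, page_view from table_1 order by page_view desc')
for blogs_id, title, page_view in cursor.fetchall():
    print('%s  %s  views=%s' % (blogs_id, title, page_view))
db.close()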
Finally, a screenshot of one run (the image itself is not reproduced in this draft).