一个简单的python爬虫,爬取知乎
程序员文章站
2022-06-15 18:18:00
一个简单的python爬虫,爬取知乎。主要实现:爬取一个收藏夹里所有问题答案下的图片。文字信息暂未收录,可自行实现,比图片更简单。具体代码里有详细注释,请自行阅读。
一个简单的python爬虫,爬取知乎
- 主要实现 爬取一个收藏夹 里 所有问题答案下的 图片
- 文字信息暂未收录,可自行实现,比图片更简单
- 具体代码里有详细注释,请自行阅读
项目源码:
# -*- coding:utf-8 -*-

# NOTE(review): the original article lowercased every identifier; the names of
# the project-local API (spiderhtml / geturl / savetext / saveimg) are kept
# exactly as published — confirm their real casing against spider.py.
from spider import spiderhtml
from multiprocessing import pool  # NOTE(review): imported but never used; kept to avoid changing the file's imports
import sys, urllib, http, os, random, re, time

__author__ = 'waiting'
'''
Requires the third-party library beautifulsoup4 (install it yourself)
and the spider.py module in the same directory.
Original environment: python3.4, windows7
'''

# Collection to crawl; the ?page= query parameter is appended in start().
url = 'https://www.zhihu.com/collection/30822111'

# Local storage root; created automatically if missing.
# (backslash before the CJK path component is now escaped — the original
# relied on Python keeping unrecognized escape sequences verbatim)
store_path = 'e:\\zhihu\\收藏夹\\会员才知道的世界'


class zhihucollectionspider(spiderhtml):
    """Crawl one Zhihu collection and save every answer's images (and author info)."""

    def __init__(self, pagestart, pageend, url):
        """Remember the inclusive page range [pagestart, pageend] and the collection URL."""
        self._url = url
        self._pagestart = int(pagestart)
        self._pageend = int(pageend) + 1   # +1 because range() excludes the upper bound
        self.downlimit = 0                 # answers with fewer upvotes than this are skipped

    def start(self):
        """Walk every page of the collection and process each question on it."""
        for page in range(self._pagestart, self._pageend):
            page_url = self._url + '?page=' + str(page)
            content = self.geturl(page_url)
            questionlist = content.find_all('div', class_='zm-item')
            for question in questionlist:
                qtitle = question.find('h2', class_='zm-item-title')
                if qtitle is None:         # question was removed/censored — no title element
                    continue

                qurl = 'https://www.zhihu.com' + qtitle.a['href']
                # Replace characters Windows forbids in file/dir names
                # (bug fix: original character class was missing '|')
                qtitle = re.sub(r'[\\/:*?"<>|]', '#', qtitle.a.string)
                try:
                    print('-----正在获取问题:' + qtitle + '-----')
                except UnicodeEncodeError:
                    # console codepage cannot render the title; crawl continues anyway
                    print(r'---问题含有特殊字符无法显示---')
                try:
                    qcontent = self.geturl(qurl)
                except Exception:
                    print('!!!!获取出错!!!!!')
                    continue               # bug fix: original fell through and used an unbound qcontent
                answerlist = qcontent.find_all('div', class_='zm-item-answer zm-item-expanded')
                self._processanswer(answerlist, qtitle)
                time.sleep(5)              # throttle between questions to be polite to the server

    def _processanswer(self, answerlist, qtitle):
        """Save author info and download images for every answer above self.downlimit."""
        j = 0
        for answer in answerlist:
            j = j + 1

            # Upvote count; Zhihu abbreviates thousands with a trailing 'k'/'K'.
            # Bug fix: the original replace('k','000') produced int('1.2000')
            # for fractional counts like '1.2K' and crashed with ValueError.
            raw = answer.find('span', class_='count').string
            if raw.lower().endswith('k'):
                upvoted = int(float(raw[:-1]) * 1000)
            else:
                upvoted = int(raw)
            if upvoted < self.downlimit:
                continue

            authorinfo = answer.find('div', class_='zm-item-answer-author-info')
            author = {'introduction': '', 'link': ''}
            try:
                author['name'] = authorinfo.find('a', class_='author-link').string
                author['introduction'] = str(authorinfo.find('span', class_='bio')['title'])
                author['link'] = authorinfo.find('a', class_='author-link')['href']
            except AttributeError:
                author['name'] = '匿名用户' + str(j)   # anonymous users have no author-link element
            except TypeError:
                pass                                   # bio is empty — keep the defaults

            file_name = os.path.join(store_path, qtitle, 'info', author['name'] + '_info.txt')
            if os.path.exists(file_name):              # already crawled on a previous run
                continue

            self.savetext(file_name, '{introduction}\r\n{link}'.format(**author))
            print('正在获取用户`{name}`的答案'.format(**author))
            answercontent = answer.find('div', class_='zm-editable-content clearfix')
            if answercontent is None:                  # reported answers carry no content node
                continue

            imgs = answercontent.find_all('img')
            if imgs:                                   # answers without images need no download step
                self._getimgfromanswer(imgs, qtitle, **author)

    def _getimgfromanswer(self, imgs, qtitle, **author):
        """Download each full-size image of one answer into the question's directory."""
        i = 0
        for img in imgs:
            if 'inline-image' in img['class']:  # skip Zhihu's small inline icons
                continue
            i = i + 1
            imgurl = img['src']
            extension = os.path.splitext(imgurl)[1]
            path_name = os.path.join(store_path, qtitle, author['name'] + '_' + str(i) + extension)
            try:
                # Best-effort: any single image failure must not abort the crawl.
                self.saveimg(imgurl, path_name)
            except Exception:
                pass

    def _gettextfromanswer(self):
        """Collecting answer text is not implemented yet."""
        pass


# CLI usage, e.g.:  zhihu.py 1 5   -> crawl pages 1 through 5
if __name__ == '__main__':
    page, pageend = 1, 1                # defaults: first page only
    argc = len(sys.argv)
    if argc >= 3:
        page, pageend = sys.argv[1], sys.argv[2]
    elif argc == 2:
        page = pageend = sys.argv[1]    # single argument crawls exactly that page

    spider = zhihucollectionspider(page, pageend, url)
    spider.start()
很多初学者,对python的概念都是模糊不清的,c语言、python能做什么,学的时候,该按照什么线路去学习,学完往哪方面发展,想深入了解,详情可以点击有道云笔记链接了解:http://note.youdao.com/noteshare?id=e4fa02e7b56d7909a27674cdb3da08aa