Python3获取豆瓣图书标签的前20本热门书籍(一)
程序员文章站
2022-05-22 17:02:29
介绍 第一篇主要获取豆瓣的大分类、大分类下的具体分类以及具体分类下的前20本热门书籍,第二篇对获取的数据进行分析。 准备 Python3.6、requests、BeautifulSoup4 演示 代码 ......
介绍
第一篇主要获取豆瓣的大分类、大分类下的具体分类以及具体分类下的前20本热门书籍,第二篇对获取的数据进行分析。
准备
Python 3.6、requests、BeautifulSoup4
演示
代码
# -*- coding: utf-8 -*-
# @author: sexy phoenix
# @last modified by: sexy phoenix
"""Interactive console browser for Douban book tags.

The script downloads the Douban tag index, lets the user pick a top-level
category and then a tag inside it, and prints the books listed on that tag's
page.
"""
import requests
from bs4 import BeautifulSoup, SoupStrainer

# Seconds to wait for Douban before giving up; without a timeout
# ``requests`` may block forever on a stalled connection.
_REQUEST_TIMEOUT = 10


def _text_two_siblings_after(node):
    """Return the stripped ``.string`` of the node two siblings after *node*.

    The page markup separates neighbouring elements with a whitespace text
    node, so the element of interest is reached by following ``next_sibling``
    twice. Returns ``''`` when *node* is ``None``, when a sibling is missing,
    or when the target has no single string.
    """
    for _ in range(2):
        if node is None:
            return ''
        node = node.next_sibling
    text = getattr(node, 'string', None)
    return text.strip() if text else ''


def _read_index(prompt, size):
    """Prompt for a 1-based choice and return it as a 0-based index.

    Returns ``None`` when the input is not an integer, is missing (EOF), or
    falls outside ``1..size``. Negative indexes are rejected explicitly so
    that entering ``0`` cannot wrap around to the last element.
    """
    try:
        index = int(input(prompt)) - 1
    except (ValueError, EOFError):
        return None
    return index if 0 <= index < size else None


# Content parsing. The class name is kept lowercase so existing references
# to ``parse`` keep working.
class parse:
    """Convert Douban HTML pages into plain Python data structures."""

    def parse_tags(self, content):
        """Extract the categories and their tags from the tag index page.

        Args:
            content: HTML text of the tag index page.

        Returns:
            A ``(category, sub_category)`` tuple. ``category`` maps a
            zero-based position to the ``name`` attribute of each
            ``a.tag-title-wrapper`` element. ``sub_category`` maps a
            zero-based position to the list of link strings found inside
            each ``table.tagcol`` element.
        """
        # Only the ``div.article`` subtree is parsed.
        only_div_tags = SoupStrainer('div', 'article')
        soup = BeautifulSoup(content, 'lxml', parse_only=only_div_tags)

        # Top-level categories.
        category = {
            index: tag.get('name')
            for index, tag in enumerate(soup.find_all('a', 'tag-title-wrapper'))
        }
        # Tags belonging to each top-level category.
        sub_category = {
            index: [link.string for link in table.find_all('a')]
            for index, table in enumerate(soup.find_all('table', 'tagcol'))
        }
        return category, sub_category

    def parse_detail_tag(self, content):
        """Extract the books listed on a tag page.

        Args:
            content: HTML text of a tag page.

        Returns:
            A list with one dict per ``li.subject-item`` element, holding the
            keys ``title``, ``price``, ``author``, ``appraise`` (rating) and
            ``appraise_num`` (rating count text). Fields that are absent from
            the markup are returned as empty strings; items without a title
            link are skipped.
        """
        detail_conent = []
        only_ul_tags = SoupStrainer('ul', 'subject-list')
        soup = BeautifulSoup(content, 'lxml', parse_only=only_ul_tags)

        for li in soup.find_all('li', 'subject-item'):
            info = li.find('div', 'info')
            if info is None or info.h2 is None or info.h2.a is None:
                continue

            # The text after the title is a '/'-separated string; the first
            # field is used as the author and the last as the price.
            extra_info = _text_two_siblings_after(info.h2).split('/')

            # The rating span may be missing, so guard against ``None``
            # instead of raising AttributeError.
            star = info.find('span', 'rating_nums')
            if star is None:
                appraise = ''
                appraise_num = ''
            else:
                appraise = star.string or ''
                appraise_num = _text_two_siblings_after(star)

            detail_conent.append({
                'title': info.h2.a.get('title'),
                'price': extra_info[-1].strip(),
                'author': extra_info[0].strip(),
                'appraise': appraise,
                'appraise_num': appraise_num,
            })
        return detail_conent


# Content fetching. The class name is kept lowercase so existing references
# to ``spider`` keep working.
class spider:
    """Download Douban pages and drive the interactive console session."""

    def __init__(self):
        self.url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        self.tag_url = 'https://book.douban.com/tag/'
        self.headers = {
            'user-agent': 'mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.36 (khtml, like gecko) chrome/76.0.3809.100 safari/537.36'
        }
        self.parse = parse()

    def _fetch(self, url, error_message):
        """Return the body of *url* as text, or ``None`` after printing
        *error_message* when the request fails or the status is not OK."""
        try:
            # ``headers`` must be passed by keyword: the second positional
            # parameter of ``requests.get`` is ``params`` (the query string).
            data = requests.get(url, headers=self.headers,
                                timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            print(error_message)
            return None
        if data.status_code == requests.codes.ok:
            return data.text
        print(error_message)
        return None

    def get_all_tag(self):
        """Fetch and parse the tag index page.

        Returns:
            The ``(category, sub_category)`` tuple produced by
            ``parse.parse_tags``, or ``None`` if the page could not be
            fetched (an error line is printed in that case).
        """
        text = self._fetch(self.url, '[error]: get category error')
        if text is None:
            return None
        return self.parse.parse_tags(text)

    def get_detail_tag(self, tag_name):
        """Fetch and parse the page of a single tag.

        Args:
            tag_name: Tag text appended to the tag base URL.

        Returns:
            The list of book dicts produced by ``parse.parse_detail_tag``,
            or ``None`` if the page could not be fetched (an error line is
            printed in that case).
        """
        text = self._fetch(self.tag_url + tag_name,
                           '[error]: get sub category error')
        if text is None:
            return None
        return self.parse.parse_detail_tag(text)

    def show(self):
        """Run the interactive session: list categories, read the user's two
        choices, and print the books of the selected tag."""
        result = self.get_all_tag()
        if result is None:
            # The fetch error has already been reported.
            return
        category, sub_category = result

        print('豆瓣大分类:')
        for index, value in category.items():
            print("{0}、{1}".format(index + 1, value))

        key = _read_index('请输入您选择的大分类:', len(sub_category))
        if key is None:
            print('[error]: 大分类选择错误')
            return

        sub_cate = sub_category[key]
        for position, name in enumerate(sub_cate, start=1):
            print("{0}、{1}".format(position, name))

        sub_key = _read_index('请输入您选择的具体分类:', len(sub_cate))
        if sub_key is None:
            print('[error]: 具体分类选择错误')
            return

        detail_content = self.get_detail_tag(sub_cate[sub_key])
        if detail_content is None:
            # The fetch error has already been reported.
            return

        for book in detail_content:
            print('\n')
            print(book['title'])
            print("作者:{0}, 价格:{1}, 评分:{2}{3}".format(
                book['author'], book['price'],
                book['appraise'], book['appraise_num']))
            print('=' * 50)


# Entry point.
if __name__ == '__main__':
    # Use a distinct variable name so the ``spider`` class is not shadowed.
    crawler = spider()
    crawler.show()
上一篇: 千古一帝秦始皇,为何有三个父亲?
下一篇: 鼠药穿肠过,佛祖心中留。