Python Beautiful Soup 采集 IT books PDF,免费下载
程序员文章站
2022-03-22 12:03:24
http://www.allitebooks.org/ 是我见过最良心的网站,所有书籍免费下载 周末无聊,尝试采集此站所有Pdf书籍。 采用技术 python3.5 Beautiful soup 分享代码 最简单的爬虫,没有考虑太多的容错,建议大家尝试的时候,温柔点, 别把这个良心网站搞挂掉了 ......
http://www.allitebooks.org/ 是我见过最良心的网站,所有书籍免费下载。
周末无聊,尝试采集此站所有 PDF 书籍。
采用技术
- Python 3.5
- Beautiful Soup
分享代码
最简单的爬虫,没有考虑太多的容错,建议大家尝试的时候,温柔点,别把这个良心网站搞挂掉了
# www.qingmiaokeji.cn
"""Crawl allitebooks.org and record every book's title, cover and PDF link.

The crawl walks: site menu -> category -> paginated listing -> book page,
and appends one line per book to a text file.  It is sequential and
best-effort: a page that cannot be fetched or parsed is logged and skipped
so one bad page does not abort the whole run.
"""
import json
import logging

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

siteurl = 'http://www.allitebooks.org/'

# Seconds to wait for a response before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# File that getbookdetail() appends to by default.
BOOKLIST_PATH = 'd:/booklist.txt'

# Title of the pagination anchor whose text is the last page number.
# Compared case-insensitively because the exact case used by the site's
# markup could not be confirmed from this script alone.
LAST_PAGE_TITLE = 'last page →'


def _fetch_soup(url):
    """Download *url* and return it parsed, or None if the request failed."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx as failures instead of parsing the error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.warning("Skipping %s: %s", url, exc)
        return None
    return BeautifulSoup(response.text, "html.parser")


def category():
    """Return the category links found in the site's sub-menu.

    Returns:
        A list of ``{'name': <link text>, 'href': <link target>}`` dicts,
        one per ``.sub-menu li a`` anchor that has an ``href``.  The list is
        empty when the home page could not be fetched.
    """
    soup = _fetch_soup(siteurl)
    if soup is None:
        return []
    return [
        {'name': a.get_text(), 'href': a.get("href")}
        for a in soup.select('.sub-menu li a')
        if a.get("href")
    ]


def bookurllist(url):
    """Visit every listing page of one category.

    Args:
        url: A category dict as produced by ``category()``; only its
            ``'href'`` entry is read.
    """
    href = url['href']
    soup = _fetch_soup(href)
    if soup is None:
        return

    # With no "last page" link, still visit page 1 rather than skipping the
    # category entirely (presumably a category that fits on a single page).
    last_page = 1
    for link in soup.select(".pagination a"):
        if (link.get("title") or "").lower() != LAST_PAGE_TITLE:
            continue
        try:
            last_page = int(link.get_text())
        except ValueError:
            logger.warning("Unexpected page number %r on %s",
                           link.get_text(), href)

    # Normalise the trailing slash so the page URL is well-formed either way.
    base = href.rstrip('/')
    for page in range(1, last_page + 1):
        booklist(base + "/page/" + str(page))


def booklist(url):
    """Process every book linked from one listing page.

    Args:
        url: Address of a paginated listing page.
    """
    soup = _fetch_soup(url)
    if soup is None:
        return
    for link in soup.select(".main-content-inner article .entry-title a"):
        book_url = link.get("href")
        if book_url:
            getbookdetail(book_url)


def getbookdetail(url, output_path=BOOKLIST_PATH):
    """Append one book's title, cover image and download link to a file.

    The line written is ``<title> | ![](<cover url>) | <download url>``.

    Args:
        url: Address of the book's detail page.
        output_path: Text file to append to (UTF-8).  Defaults to the
            location the script has always used.
    """
    soup = _fetch_soup(url)
    if soup is None:
        return

    title_tag = soup.select_one(".single-title")
    if title_tag is None:
        # Not a recognisable book page; skip it rather than crash the crawl.
        logger.warning("No title found on %s; skipping", url)
        return
    # Strip surrounding whitespace so each book stays on a single line.
    title = title_tag.get_text().strip()

    cover_tag = soup.select_one(
        ".entry-body-thumbnail .attachment-post-thumbnail")
    imgurl = (cover_tag.get("src") if cover_tag is not None else None) or ""

    download_tag = soup.select_one(".download-links a")
    downloadpdfurl = (
        download_tag.get("href") if download_tag is not None else None) or ""
    if not downloadpdfurl:
        logger.warning("No download link found on %s", url)

    with open(output_path, 'a', encoding='utf-8') as f:
        f.write("{} | ![]({}) | {}\n".format(title, imgurl, downloadpdfurl))


if __name__ == '__main__':
    for category_link in category():
        bookurllist(category_link)