A Simple Crawler Framework: Baidu Baike
URLManager.py
class URLManager:
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs that have already been crawled

    def is_new_urls_empty(self):
        # True when there is nothing left to crawl
        return len(self.new_urls) == 0

    def add_new_url(self, url):
        # Only queue a URL that has not been crawled yet
        if url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        # Pop an arbitrary pending URL and mark it as crawled
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url
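A quick sanity check of the manager's deduplication, as a minimal sketch (the URLs are placeholders):

# Minimal sketch of URLManager's dedup behavior; the URLs are placeholders.
manager = URLManager()
manager.add_new_url("https://baike.baidu.com/item/a")
manager.add_new_urls(["https://baike.baidu.com/item/a",
                      "https://baike.baidu.com/item/b"])
url = manager.get_new_url()         # pops one pending URL, marks it crawled
manager.add_new_url(url)            # ignored: it is already in old_urls
print(manager.is_new_urls_empty())  # False, one URL is still pending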
HTMLDownLoader.py
import urllib.request

class HTMLDownLoader:
    def get_page(self, url):
        # Send a browser-like User-Agent so the request is not rejected outright
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
        }
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        return html
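Note that urlopen raises on network failures and non-200 status codes, which would abort the whole crawl. A more defensive variant might look like the sketch below; the 10-second timeout is an arbitrary choice:

# Sketch of a fault-tolerant fetch; the timeout value is arbitrary.
import urllib.error
import urllib.request

def get_page_safe(url, headers):
    try:
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request, timeout=10)
        return response.read().decode('utf-8')
    except (urllib.error.URLError, UnicodeDecodeError) as err:
        print("Failed to fetch", url, "-", err)
        return None  # the caller has to be prepared to skip this page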
HTMLParser.py
from lxml import etree
import urllib.parse

class HTMLParser:
    def parse_content(self, content):
        self.selector = etree.HTML(content, etree.HTMLParser())

    def get_url(self, page_url):
        # Collect the links inside the summary block and resolve any
        # relative hrefs against the current page URL
        items = self.selector.xpath('//div[@class="lemma-summary"]//a')
        urls = []
        for item in items:
            url = urllib.parse.urljoin(page_url, item.get('href'))
            urls.append(url)
        return urls

    def get_abstract(self):
        title = self.selector.xpath('//h1/text()')
        abstracts = self.selector.xpath('//div[@class="lemma-summary"]//text()')
        abstract = ""
        for t in abstracts:
            if t != "\n":
                abstract += t
        return title[0], abstract
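The two XPath expressions can be checked offline against a hand-written snippet; the markup below is made up to mimic Baike's lemma-summary structure:

# Offline check of the XPath logic; the HTML snippet is invented.
sample = '''
<html><body>
  <h1>Python</h1>
  <div class="lemma-summary">
    Python is a <a href="/item/programming-language">programming language</a>.
  </div>
</body></html>
'''
parser = HTMLParser()
parser.parse_content(sample)
print(parser.get_url("https://baike.baidu.com/item/Python"))
# -> ['https://baike.baidu.com/item/programming-language']
title, abstract = parser.get_abstract()
print(title)     # -> Python
print(abstract)  # -> the summary text, surrounding whitespace included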
FileWriter.py
import os

class FileWriter:
    def __init__(self):
        self.path = None
        self.name = None
        self.file = None

    def set_path(self, path):
        """
        Set the directory the output file is saved in.
        :param path: target folder
        :return:
        """
        self.path = path

    def set_file_name(self, name):
        self.name = name

    def open_file(self):
        # Create the target directory first if it does not exist yet
        os.makedirs(self.path, exist_ok=True)
        self.file = open(os.path.join(self.path, self.name), "w", encoding='utf-8')

    def write_file(self, title, abstract):
        # One entry per line: title, a space, then the abstract
        self.file.write(title + " " + abstract + "\n")

    def close_file(self):
        self.file.close()
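FileWriter expects its methods in a fixed order: set_path, set_file_name, open_file, any number of write_file calls, then close_file. A minimal round trip, with placeholder path and file name:

# Minimal FileWriter round trip; path and file name are placeholders.
writer = FileWriter()
writer.set_path(r"D:\tutorial\MyFrame")
writer.set_file_name("demo.txt")
writer.open_file()  # also creates the directory if needed
writer.write_file("Python", "a programming language")
writer.close_file()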
FrameManager.py
from HTMLDownLoader import HTMLDownLoader
from HTMLParser import HTMLParser
from URLManager import URLManager
from FileWriter import FileWriter
import urllib.parse
import random
import time

class FrameManager:
    def __init__(self):
        self.downloader = HTMLDownLoader()
        self.parser = HTMLParser()
        self.urls = URLManager()
        self.file = FileWriter()
        self.num = 0

    def crawl(self, url):
        # Seed the URL manager with the start URL
        self.urls.add_new_url(url=url)
        self.file.set_path(r"D:\tutorial\MyFrame")
        self.file.set_file_name("百度百科.txt")
        self.file.open_file()
        # Stop when the queue runs dry or five pages have been crawled
        while not self.urls.is_new_urls_empty() and self.num < 5:
            current_url = self.urls.get_new_url()
            html = self.downloader.get_page(current_url)  # download the page
            self.num += 1
            print("Crawled page", self.num)
            self.parser.parse_content(html)
            urls = self.parser.get_url(current_url)  # extract the outgoing links
            title, abstract = self.parser.get_abstract()
            print(title)
            print(abstract)
            self.file.write_file(title, abstract)
            print("Parsed page", self.num)
            self.urls.add_new_urls(urls=urls)  # queue the newly found URLs
            time.sleep(random.randint(5, 15))  # pause 5-15 s between requests
        self.file.close_file()

if __name__ == "__main__":
    framemanager = FrameManager()
    keyword = input("Enter the Baike entry to crawl: ")
    base_url = "https://baike.baidu.com/item/"
    quote = urllib.parse.quote(keyword, encoding="utf-8")
    start_url = base_url + quote
    framemanager.crawl(start_url)
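To drive the crawler from another script instead of the interactive prompt, the start URL can be assembled the same way; the keyword below is an arbitrary example:

# Non-interactive driver sketch; "Python" is an arbitrary example keyword.
import urllib.parse
from FrameManager import FrameManager

keyword = "Python"
start_url = "https://baike.baidu.com/item/" + urllib.parse.quote(keyword, encoding="utf-8")
FrameManager().crawl(start_url)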