
A Simple Crawler Framework: Baidu Baike

...
URLManager.py
class URLManager:
	def __init__(self):
		self.new_urls = set()  # URLs waiting to be crawled
		self.old_urls = set()  # URLs that have already been crawled

	def is_new_urls_empty(self):
		return len(self.new_urls) == 0

	def add_new_url(self, url):
		# Only queue URLs that have not been crawled yet; the set drops duplicates itself
		if url not in self.old_urls:
			self.new_urls.add(url)

	def add_new_urls(self, urls):
		for url in urls:
			self.add_new_url(url)

	def get_new_url(self):
		# Pop an arbitrary pending URL and mark it as crawled
		url = self.new_urls.pop()
		self.old_urls.add(url)
		return url
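
The manager's deduplication is easy to check in isolation. A minimal sketch, assuming the class above; the sample URL is illustrative only:

manager = URLManager()
manager.add_new_url("https://baike.baidu.com/item/Python")
manager.add_new_urls(["https://baike.baidu.com/item/Python"])  # duplicate, absorbed by the set
url = manager.get_new_url()         # moves the URL into old_urls
manager.add_new_url(url)            # already crawled, so it is not re-queued
print(manager.is_new_urls_empty())  # True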
HTMLDownLoader.py
import urllib.request


class HTMLDownLoader:

	def get_page(self, url):
		# Send a browser-like User-Agent so the site does not reject the request
		headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
		}
		request = urllib.request.Request(url=url, headers=headers)
		response = urllib.request.urlopen(request)
		html = response.read().decode('utf-8')
		return html
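
Note that get_page raises on network failures, which would abort the whole crawl. A hedged variant (an illustrative sketch, not part of the original framework) catches urllib's errors and returns None instead:

import urllib.error
import urllib.request

def get_page_safe(url, headers):
	# Hypothetical fault-tolerant fetch; returning None on failure is an
	# assumption, so the caller must be prepared to skip None results
	try:
		request = urllib.request.Request(url=url, headers=headers)
		response = urllib.request.urlopen(request, timeout=10)
		return response.read().decode('utf-8')
	except (urllib.error.URLError, UnicodeDecodeError) as e:
		print("Failed to fetch", url, "-", e)
		return None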
HTMLParser.py
from lxml import etree
import urllib.parse


class HTMLParser:
	def parse_content(self, content):
		# Build an lxml selector over the downloaded page
		self.selector = etree.HTML(content, etree.HTMLParser())

	def get_url(self, page_url):
		# Collect the links inside the lemma summary and resolve them against
		# the current page URL, so relative hrefs become absolute
		items = self.selector.xpath('//div[@class="lemma-summary"]//a')
		urls = []
		for item in items:
			url = urllib.parse.urljoin(page_url, item.get('href'))
			urls.append(url)
		return urls

	def get_abstract(self):
		# The entry title sits in the page's <h1>; the abstract is the
		# concatenated text of the lemma-summary block, minus bare newlines
		title = self.selector.xpath('//h1/text()')
		abstracts = self.selector.xpath('//div[@class="lemma-summary"]//text()')
		abstract = ""
		for t in abstracts:
			if t != "\n":
				abstract += t
		return title[0], abstract
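
The parser can be exercised offline by feeding it a hand-written snippet that mimics the lemma-summary structure (the markup below is fabricated for illustration, not real Baidu Baike HTML):

sample = ('<html><body><h1>Python</h1>'
          '<div class="lemma-summary">A <a href="/item/language">language</a> summary.</div>'
          '</body></html>')
parser = HTMLParser()
parser.parse_content(sample)
print(parser.get_url("https://baike.baidu.com/item/Python"))
# ['https://baike.baidu.com/item/language']
print(parser.get_abstract())
# ('Python', 'A language summary.')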
FileWriter.py
import os


class FileWriter:
	def __init__(self):
		self.path = None
		self.name = None
		self.file = None

	def set_path(self, path):
		"""
		Set the directory the output file is saved in.
		:param path: target folder
		:return:
		"""
		self.path = path

	def set_file_name(self, name):
		self.name = name

	def open_file(self):
		os.makedirs(self.path, exist_ok=True)
		self.file = open(os.path.join(self.path, self.name), "w", encoding='utf-8')

	def write_file(self, title, abstract):
		# One record per entry: title, a tab, the abstract, and a trailing newline
		self.file.write(title + "\t" + abstract + "\n")

	def close_file(self):
		self.file.close()
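
FileWriter is a thin wrapper around open/write/close, so the expected call order is: set the path and file name, open, write records, close. A quick sketch with an illustrative path:

writer = FileWriter()
writer.set_path("output")          # folder is created if it does not exist
writer.set_file_name("test.txt")
writer.open_file()
writer.write_file("Python", "A programming language.")
writer.close_file()
# output/test.txt now holds one tab-separated record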
FrameManager.py
from HTMLDownLoader import HTMLDownLoader
from HTMLParser import HTMLParser
from URLManager import URLManager
from FileWriter import FileWriter

import urllib.parse
import random
import time


class FrameManager:
	def __init__(self):
		self.downloader = HTMLDownLoader()
		self.parser = HTMLParser()
		self.urls = URLManager()
		self.file = FileWriter()

		self.num = 0

	def crawl(self, url):
		# Seed the pending set with the start URL
		self.urls.add_new_url(url=url)
		self.file.set_path(r"D:\tutorial\MyFrame")
		self.file.set_file_name("百度百科.txt")
		self.file.open_file()
		while not self.urls.is_new_urls_empty() and self.num < 5:
			current_url = self.urls.get_new_url()

			html = self.downloader.get_page(current_url)	# download the page content
			self.num += 1
			print("Finished crawling page", self.num)
			self.parser.parse_content(html)
			urls = self.parser.get_url(current_url)   # parse the page and collect the URLs it contains
			title, abstract = self.parser.get_abstract()
			print(title)
			print(abstract)
			self.file.write_file(title, abstract)
			print("Finished parsing page", self.num)
			self.urls.add_new_urls(urls=urls)               # queue the page's URLs for later crawling

			# Pause a random interval between requests to stay polite to the server
			time.sleep(random.randint(5, 15))
		self.file.close_file()


if __name__ == "__main__":
	framemanager = FrameManager()
	keyword = input("Enter the Baidu Baike entry to crawl: ")
	base_url = "https://baike.baidu.com/item/"
	quote = urllib.parse.quote(keyword, encoding="utf-8")
	start_url = base_url + quote
	framemanager.crawl(start_url)
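
For a Chinese keyword, urllib.parse.quote percent-encodes the UTF-8 bytes, so the start URL stays ASCII-safe. For example:

import urllib.parse
print(urllib.parse.quote("计算机", encoding="utf-8"))
# %E8%AE%A1%E7%AE%97%E6%9C%BA, giving https://baike.baidu.com/item/%E8%AE%A1%E7%AE%97%E6%9C%BA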