Basic Python crawlers: crawling a graph-structured website depth-first and breadth-first
Today's goal is to crawl a more complex website, one whose pages link to each other in a graph structure, first depth-first and then breadth-first.
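Both crawlers below are written against a small test site served locally at http://127.0.0.1:5000/, whose pages link to one another so the link structure forms a graph (possibly with cycles, which is why a visited-URL list is kept). The original article does not show that site; the following is only a minimal sketch of what such a site could look like, assuming Flask and hypothetical page names such as books.htm, page1.htm, page2.htm:

# Hypothetical local test site (not part of the original article): a few pages
# whose links form a small graph, including a cycle back to books.htm.
from flask import Flask

app = Flask(__name__)

pages = {
    'books.htm': "<h3>books</h3><a href='page1.htm'>p1</a> <a href='page2.htm'>p2</a>",
    'page1.htm': "<h3>page 1</h3><a href='page2.htm'>p2</a>",
    'page2.htm': "<h3>page 2</h3><a href='books.htm'>back to books</a>",
}

@app.route('/')
def index():
    # the breadth-first example starts from the site root
    return "<h3>home</h3><a href='books.htm'>books</a>"

@app.route('/<name>')
def page(name):
    return pages.get(name, "<h3>not found</h3>")

if __name__ == '__main__':
    app.run()   # listens on http://127.0.0.1:5000/ by default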
1. Depth-first crawling of a graph-structured website
The code, with explanations in the comments, is as follows:
from bs4 import BeautifulSoup
import urllib.request


class Stack:
    def __init__(self):
        self.st = []

    def pop(self):
        return self.st.pop()

    def push(self, obj):
        self.st.append(obj)

    def empty(self):
        return len(self.st) == 0


def visit(url):
    global urls
    if url in urls:              # already visited: the site is a graph, so avoid loops
        return []
    urls.append(url)
    try:
        data = urllib.request.urlopen(url)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, 'lxml')
        print(soup.find('h3').text)
        links = soup.select('a')
        return links
    except Exception as err:
        print(err)
        return []                # return an empty list so callers can always iterate


start_url = "http://127.0.0.1:5000/"
urls = []


def spider(url):                 # depth-first crawl by recursion
    links = visit(url)
    for link in links:
        url = start_url + link['href']
        spider(url)


def DFS():                       # depth-first crawl with an explicit stack
    st = Stack()
    st.push(start_url + 'books.htm')
    while not st.empty():
        url = st.pop()
        links = visit(url)
        for link in links:       # push left to right (so pages are visited right to left)
            url = start_url + link['href']
            st.push(url)
        #
        # for i in range(len(links) - 1, -1, -1):   # push right to left (visited left to right)
        #     url = start_url + links[i]['href']
        #     st.push(url)


# Depth-first visit of the site (graph structure)
urls = []
spider(start_url + 'books.htm')  # result obtained by recursion
print()                          # blank line between the two runs
urls = []
DFS()                            # result obtained with a stack; the push order controls left-to-right vs right-to-left
print('the end')
# Alternative (commented out): a non-recursive spider that does the visiting
# inline, using an explicit stack and pushing the links from right to left.
# def spider(url):
#     global urls
#     stack = Stack()
#     stack.push(url)
#     while not stack.empty():
#         url = stack.pop()
#         if url not in urls:
#             urls.append(url)
#             try:
#                 data = urllib.request.urlopen(url)
#                 data = data.read()
#                 data = data.decode()
#                 soup = BeautifulSoup(data, 'lxml')
#                 print(soup.find('h3').text)
#                 links = soup.select('a')
#                 for i in range(len(links) - 1, -1, -1):
#                     href = links[i]['href']
#                     url = start_url + '/' + href
#                     stack.push(url)
#             except Exception as err:
#                 print(err)
#
# start_url = "http://127.0.0.1:5000"
# urls = []
# spider(start_url)
# print('the end')
Running result:
2. Breadth-first crawling of a graph-structured website
Python code:
from bs4 import BeautifulSoup
import urllib.request


# Breadth-first crawl of a graph-structured website, using a queue
class Queue:
    def __init__(self):
        self.st = []

    def fetch(self):
        return self.st.pop(0)

    def enter(self, obj):
        self.st.append(obj)

    def empty(self):
        return len(self.st) == 0


def visit(url):
    global urls
    if url in urls:              # already visited: the site is a graph, so avoid loops
        return []
    urls.append(url)
    try:
        data = urllib.request.urlopen(url)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, 'lxml')
        print(soup.find('h3').text)
        links = soup.select('a')
        return links
    except Exception as err:
        print(err)
        return []                # return an empty list so the caller can always iterate


start_url = "http://127.0.0.1:5000/"
urls = []


def spider(url):                 # breadth-first crawl with an explicit queue
    global urls
    q = Queue()
    q.enter(url)
    while not q.empty():
        url = q.fetch()
        links = visit(url)
        for link in links:
            url = start_url + link['href']
            q.enter(url)


spider(start_url)
print('the end')
Running result:
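Comparing the two programs, the only real difference is the frontier data structure: the depth-first version takes the most recently pushed URL (a stack), while the breadth-first version takes the oldest one (a queue). Note also that list.pop(0) in the Queue class is O(n); collections.deque pops from either end in O(1). As a sketch (not from the original article), both traversals can be expressed by one function whose behaviour is chosen by which end of a deque the next URL is taken from:

# Sketch: DFS and BFS crawling unified into one function over a deque frontier.
from collections import deque
import urllib.request
from bs4 import BeautifulSoup

start_url = "http://127.0.0.1:5000/"


def crawl(first_url, depth_first=True):
    frontier = deque([first_url])
    visited = []
    while frontier:
        # DFS pops the newest URL (stack behaviour); BFS pops the oldest (queue behaviour)
        url = frontier.pop() if depth_first else frontier.popleft()
        if url in visited:
            continue
        visited.append(url)
        try:
            data = urllib.request.urlopen(url).read().decode()
            soup = BeautifulSoup(data, 'lxml')
            print(soup.find('h3').text)
            for link in soup.select('a'):
                frontier.append(start_url + link['href'])
        except Exception as err:
            print(err)


crawl(start_url + 'books.htm', depth_first=True)   # same family of orderings as DFS() above
crawl(start_url + 'books.htm', depth_first=False)  # breadth-first, like the Queue version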