Crawling a Website Tree with Python (Web Crawler)
I. Web page files:
(1) book.html
<h3>计算机</h3>
<ul>
<li><a href="database.html">数据库</a></li>
<li><a href="program.html">程序设计</a></li>
<li><a href="network.html">计算机网络</a></li>
</ul>
(2) database.html
<h3>数据库</h3>
<ul>
<li><a href="mysql.html">MySQL 数据库</a></li>
</ul>
(3) program.html
<h3>程序设计</h3>
<ul>
<li><a href="python.html">Python 程序设计</a></li>
<li><a href="java.html">Java 程序设计</a></li>
</ul>
(4) network.html
<h3>计算机网络</h3>
(5) mysql.html
<h3>MySQL 数据库</h3>
(6) python.html
<h3>Python 程序设计</h3>
(7) java.html
<h3>Java 程序设计</h3>
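
Taken together, the seven pages form a small tree rooted at book.html:

book.html (计算机)
├── database.html (数据库)
│   └── mysql.html (MySQL 数据库)
├── program.html (程序设计)
│   ├── python.html (Python 程序设计)
│   └── java.html (Java 程序设计)
└── network.html (计算机网络)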
II. Server side:
import flask
import os

app = flask.Flask(__name__)

def getFile(fileName):
    # Read the file in binary mode; return b"" if it does not exist.
    data = b""
    if os.path.exists(fileName):
        fobj = open(fileName, "rb")
        data = fobj.read()
        fobj.close()
    return data

@app.route("/")
def index():
    return getFile("book.html")  # root page of the site tree

@app.route("/<section>")
def process(section):
    # Serve any other page of the tree by file name.
    data = ""
    if section != "":
        data = getFile(section)
    return data

if __name__ == "__main__":
    app.run()
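
Save the server script (for illustration, say server.py; the original does not give a file name) in the same directory as the seven HTML files and run it; app.run() listens on http://127.0.0.1:5000 by default. A quick sanity check with urllib:

import urllib.request

# Assumes the Flask server above is already running locally.
with urllib.request.urlopen("http://127.0.0.1:5000") as resp:
    print(resp.read().decode())  # should print the contents of book.html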
III. Client side:
① Depth-first crawl
# coding=gbk
from bs4 import BeautifulSoup
import urllib.request

class Stack:
    # A simple LIFO stack backed by a Python list.
    def __init__(self):
        self.st = []
    def pop(self):
        return self.st.pop()
    def push(self, obj):
        self.st.append(obj)
    def empty(self):
        return len(self.st) == 0

def spider(url):
    stack = Stack()
    stack.push(url)
    while not stack.empty():
        url = stack.pop()
        try:
            data = urllib.request.urlopen(url)
            data = data.read()
            data = data.decode()
            soup = BeautifulSoup(data, "html.parser")
            print(soup.find("h3").text)
            links = soup.select("a")  # returns a list of <a> elements
            # Push the links in reverse order so they are popped,
            # and therefore visited, in document order.
            for i in range(len(links) - 1, -1, -1):
                href = links[i]["href"]
                url = start_url + "/" + href
                stack.push(url)
        except Exception as err:
            print(err)

start_url = "http://127.0.0.1:5000"
spider(start_url)
print("The End")
② Breadth-first crawl:
# coding=gbk
from bs4 import BeautifulSoup
import urllib.request

class Queue:
    # A simple FIFO queue backed by a Python list.
    def __init__(self):
        self.st = []
    def fetch(self):
        return self.st.pop(0)  # remove and return the first element
    def enter(self, obj):
        self.st.append(obj)
    def empty(self):
        return len(self.st) == 0

def spider(url):
    queue = Queue()
    queue.enter(url)
    while not queue.empty():
        url = queue.fetch()
        try:
            data = urllib.request.urlopen(url)
            data = data.read()
            data = data.decode()
            soup = BeautifulSoup(data, "html.parser")
            print(soup.find("h3").text)
            links = soup.select("a")  # returns a list of <a> elements
            for link in links:
                href = link["href"]
                url = start_url + "/" + href
                queue.enter(url)
        except Exception as err:
            print(err)

start_url = "http://127.0.0.1:5000"
spider(start_url)
print("The End")
IV. Supplement:
lst = [1, 2, 3, 4]   # named lst to avoid shadowing the built-in list
# print(lst.pop(0))  # pop(0) removes and returns the first element (queue behavior)
print(lst.pop())     # pop() defaults to removing the last element (stack behavior)

lst = [1, 2, 3, 4, 5]
for i in range(len(lst) - 1, -1, -1):  # range(start, stop, step): iterate in reverse
    print(lst[i])
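
One caveat about the Queue class above: pop(0) shifts every remaining element, so each dequeue costs O(n). The standard library's collections.deque supports O(1) appends and pops at both ends; a minimal sketch of the same FIFO behavior:

from collections import deque

q = deque()
q.append(1)         # enqueue at the right end
q.append(2)
print(q.popleft())  # dequeue from the left end in O(1); prints 1
print(q.popleft())  # prints 2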
Original article: https://blog.csdn.net/belongname_/article/details/109632971