
Python crawler basics: crawling a graph-structured website depth-first and breadth-first


Today's goal: crawling a complex, graph-structured website, first depth-first and then breadth-first.

1. Depth-first crawl of a graph-structured website
The code and explanation are as follows:

from bs4 import BeautifulSoup
import urllib.request


class Stack:  # LIFO stack backed by a Python list
    def __init__(self):
        self.st = []

    def pop(self):
        return self.st.pop()

    def push(self, obj):
        self.st.append(obj)

    def empty(self):
        return len(self.st) == 0


def visit(url):
    global urls
    if url in urls:  # skip pages already visited (the site is a graph, so links can loop back)
        return []
    urls.append(url)
    try:
        data = urllib.request.urlopen(url)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, 'lxml')
        print(soup.find('h3').text)  # page title
        links = soup.select('a')     # all outgoing links
        return links
    except Exception as err:
        print(err)
        return []  # return an empty list so callers can still iterate


start_url = "http://127.0.0.1:5000/"
urls = []


def spider(url):  # recursive depth-first traversal
    links = visit(url)
    for link in links:
        url = start_url + link['href']
        spider(url)


def DFS():  # depth-first traversal of the site, using an explicit stack
    st = Stack()
    st.push(start_url + 'books.htm')
    while not st.empty():
        url = st.pop()
        links = visit(url)
        for link in links:  # push links left to right, so pages are visited right to left
            url = start_url + link['href']
            st.push(url)
        #
        # for i in range(len(links) - 1, -1, -1):  # push right to left, so pages are visited left to right
        #     url = start_url + links[i]['href']
        #     st.push(url)


# Depth-first crawl of the site (a graph, not a tree)
urls = []
spider(start_url + 'books.htm')  # order produced by the recursive version
print()  # blank line between the two runs

urls = []
DFS()  # stack-based version; the push order controls left-to-right vs right-to-left

print('the end')

# An equivalent single-function variant: the stack DFS folded into spider(),
# starting from the site root:
# def spider(url):
#     global urls
#     stack = Stack()
#     stack.push(url)
#     while not stack.empty():
#         url = stack.pop()
#         if url not in urls:
#             urls.append(url)
#             try:
#                 data = urllib.request.urlopen(url)
#                 data = data.read()
#                 data = data.decode()
#                 soup = BeautifulSoup(data, 'lxml')
#                 print(soup.find('h3').text)
#                 links = soup.select('a')
#                 for i in range(len(links) - 1, -1, -1):
#                     href = links[i]['href']
#                     url = start_url + '/' + href
#                     stack.push(url)
#             except Exception as err:
#                 print(err)


# start_url = "http://127.0.0.1:5000"
# urls = []
# spider(start_url)
# print('the end')
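
Both versions assume a small test site running locally at http://127.0.0.1:5000/, where every page carries an <h3> title and <a> links to other pages, and pages link back to one another, so the site forms a graph with cycles rather than a tree. The page names below are hypothetical; a minimal Flask sketch of such a site might look like this:

from flask import Flask

app = Flask(__name__)

# Hypothetical page graph: name -> (title, outgoing links).
# The back-links create cycles, which is exactly why visit()
# must remember the urls it has already seen.
PAGES = {
    'books.htm':    ('Books',    ['program.htm', 'database.htm']),
    'program.htm':  ('Program',  ['python.htm', 'java.htm', 'books.htm']),
    'database.htm': ('Database', ['mysql.htm', 'books.htm']),
    'python.htm':   ('Python',   ['program.htm']),
    'java.htm':     ('Java',     ['program.htm']),
    'mysql.htm':    ('MySQL',    ['database.htm']),
}


@app.route('/<name>')
def page(name):
    title, links = PAGES.get(name, ('Unknown', []))
    anchors = ' '.join('<a href="%s">%s</a>' % (href, href) for href in links)
    return '<h3>%s</h3>%s' % (title, anchors)


if __name__ == '__main__':
    app.run()  # Flask serves on http://127.0.0.1:5000/ by default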
 

Output:
[screenshot of the crawl output in the original post]
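
The left-to-right/right-to-left comment in DFS() comes down to push order: the stack pops the most recently pushed link, so pushing links in document order visits them in reverse, and pushing them reversed visits them in document order. A tiny in-memory illustration, using a hypothetical adjacency dict in place of live pages:

graph = {
    'A': ['B', 'C'],
    'B': ['D'],
    'C': ['D'],
    'D': [],
}


def dfs_order(start, push_reversed):
    visited = []
    stack = [start]
    while stack:
        node = stack.pop()
        if node in visited:  # same duplicate check that visit() performs
            continue
        visited.append(node)
        children = graph[node]
        if push_reversed:
            children = list(reversed(children))  # push right to left
        for child in children:
            stack.append(child)
    return visited


print(dfs_order('A', push_reversed=False))  # ['A', 'C', 'D', 'B'] -- children visited right to left
print(dfs_order('A', push_reversed=True))   # ['A', 'B', 'D', 'C'] -- children visited left to right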
2. Breadth-first crawl of a graph-structured website

Python code:

from bs4 import BeautifulSoup
import urllib.request

# Breadth-first crawl of the same graph-structured site, driven by a FIFO queue
class Queue:  # FIFO queue backed by a Python list
    def __init__(self):
        self.st = []

    def fetch(self):
        return self.st.pop(0)  # dequeue from the front

    def enter(self, obj):
        self.st.append(obj)

    def empty(self):
        return len(self.st) == 0


def visit(url):
    global urls
    if url in urls:
        return []
    urls.append(url)
    try:
        data = urllib.request.urlopen(url)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, 'lxml')
        print(soup.find('h3').text)
        links = soup.select('a')
        return links
    except Exception as err:
        print(err)
        return []  # keep the return type consistent on failure

start_url = "http://127.0.0.1:5000/"
urls = []


def spider(url):  # breadth-first traversal, driven by a FIFO queue
    q = Queue()
    q.enter(url)
    while not q.empty():
        url = q.fetch()
        links = visit(url)
        for link in links:
            url = start_url + link['href']
            q.enter(url)


spider(start_url)
print('the end')

Output:
[screenshot of the crawl output in the original post]
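
One design note: Queue.fetch() uses list.pop(0), which shifts every remaining element and costs O(n) per call. The standard library's collections.deque pops from the left in O(1); a drop-in sketch of the same Queue built on deque:

from collections import deque


class Queue:  # FIFO queue backed by collections.deque
    def __init__(self):
        self.st = deque()

    def fetch(self):
        return self.st.popleft()  # O(1), unlike list.pop(0)

    def enter(self, obj):
        self.st.append(obj)

    def empty(self):
        return len(self.st) == 0

The traversal order is unchanged; the difference only matters once the frontier grows large.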
