欢迎您访问程序员文章站,本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

Python爬虫【解析库之beautifulsoup】

程序员文章站 2023-04-03 14:46:27
解析库的安装 初始化 BeautifulSoup(str,"解析库") html='''

Hello

    解析库的安装

    pip3 install beautifulsoup4

    初始化 BeautifulSoup(str,"解析库")

    from bs4 import BeautifulSoup

    html='''
    <div class="panel">
    <div class="panel-heading">
    <h4>hello</h4>
    </div>
    <div class="panel-body">
    <ul class="list" id="list-1">
    <li class="element">foo</li>
    <li class="element">bar</li>
    <li class="element">jay</li>
    </ul>
    <ul class="list list-small" id="list-2">
    <li class="element">foo</li>
    <li class="element">bar</li>
    </ul>
    </div>
    </div>
    '''

    soup = BeautifulSoup(html,"lxml") # soup = BeautifulSoup(html,"html.parser")

    标签选择器

    选择元素 soup.e

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html><head><title>the dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>the dormouse's story</b></p>
    <p class="story">once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)
    print(type(soup.title))
    print(soup.head)
    print(soup.p)
    
    """
    打印结果:
    <title>the dormouse's story</title>
    <class 'bs4.element.Tag'>
    <head><title>the dormouse's story</title></head>
    <p class="title" name="dromouse"><b>the dormouse's story</b></p>
    """
    选择元素

    获取名称 soup.e.name

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html><head><title>the dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>the dormouse's story</b></p>
    <p class="story">once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title.name) # title
    获取名称

    获取属性  soup.e.attrs[ ]  or  soup.e[ ]

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html><head><title>the dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>the dormouse's story</b></p>
    <p class="story">once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.attrs['name'])
    print(soup.p['name'])
    """
    dromouse
    dromouse
    """
    获取属性

    获取内容 soup.e.string

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html><head><title>the dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>the dormouse's story</b></p>
    <p class="story">once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.string)
    """
    the dormouse's story
    """
    获取内容

    嵌套选择 soup.e.e

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html><head><title>the dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>the dormouse's story</b></p>
    <p class="story">once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.head.title.string)
    """
    the dormouse's story
    """
    view code

    子节点 soup.e.contents

    Python爬虫【解析库之beautifulsoup】
     1 html = """
     2 <html>
     3     <head>
     4         <title>the dormouse's story</title>
     5     </head>
     6     <body>
     7         <p class="story">
     8             once upon a time there were three little sisters; and their names were
     9             <a href="http://example.com/elsie" class="sister" id="link1">
    10                 <span>elsie</span>
    11             </a>
    12             <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> 
    13             and
    14             <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>
    15             and they lived at the bottom of a well.
    16         </p>
    17         <p class="story">...</p>
    18 """
    19 from bs4 import BeautifulSoup
    20 soup = BeautifulSoup(html, 'lxml')
    21 print(soup.p.contents)
    22 """
    23 ['\n            once upon a time there were three little sisters; and their names were\n            ', <a class="sister" href="http://example.com/elsie" id="link1">
    24 <span>elsie</span>
    25 </a>, '\n', <a class="sister" href="http://example.com/lacie" id="link2">lacie</a>, ' \n            and\n            ', <a class="sister" href="http://example.com/tillie" id="link3">tillie</a>, '\n            and they lived at the bottom of a well.\n        ']
    26 
    27 """
    view code

    子节点 soup.e.children

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html>
        <head>
            <title>the dormouse's story</title>
        </head>
        <body>
            <p class="story">
                once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.children)
    for i, child in enumerate(soup.p.children):
        print(i, child)
    """
    <list_iterator object at 0x00b116d0>
    0 
                once upon a time there were three little sisters; and their names were
                
    1 <a class="sister" href="http://example.com/elsie" id="link1">
    <span>elsie</span>
    </a>
    2 
    
    3 <a class="sister" href="http://example.com/lacie" id="link2">lacie</a>
    4  
                and
                
    5 <a class="sister" href="http://example.com/tillie" id="link3">tillie</a>
    6 
                and they lived at the bottom of a well.
    """
    view code

    子孙节点 soup.e.descendants(标签里面的文本也属于子孙节点)

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html>
        <head>
            <title>the dormouse's story</title>
        </head>
        <body>
            <p class="story">
                once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.descendants)
    for i, child in enumerate(soup.p.descendants):
        print(i, child)
    """
    <generator object descendants at 0x03ebd420>
    0 
                once upon a time there were three little sisters; and their names were
                
    1 <a class="sister" href="http://example.com/elsie" id="link1">
    <span>elsie</span>
    </a>
    2 
    
    3 <span>elsie</span>
    4 elsie
    5 
    
    6 
    
    7 <a class="sister" href="http://example.com/lacie" id="link2">lacie</a>
    8 lacie
    9  
                and
                
    10 <a class="sister" href="http://example.com/tillie" id="link3">tillie</a>
    11 tillie
    12 
                and they lived at the bottom of a well.
    """
    view code

    父节点 soup.e.parent

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html>
        <head>
            <title>the dormouse's story</title>
        </head>
        <body>
            <p class="story">
                once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.a.parent)
    view code

    祖先节点 soup.e.parents

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html>
        <head>
            <title>the dormouse's story</title>
        </head>
        <body>
            <p class="story">
                once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(list(enumerate(soup.a.parents)))
    view code

    兄弟节点 soup.e.next_siblings  soup.e.previous_siblings

    Python爬虫【解析库之beautifulsoup】
    html = """
    <html>
        <head>
            <title>the dormouse's story</title>
        </head>
        <body>
            <p class="story">
                once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(list(enumerate(soup.a.next_siblings)))
    print(list(enumerate(soup.a.previous_siblings)))
    view code

    标准选择器

    find_all( name , attrs , recursive , text , **kwargs )

    可根据标签名、属性、内容查找文档

    标签名获取 soup.find_all('name')

    Python爬虫【解析库之beautifulsoup】
    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all('ul'))
    view code

    属性获取 soup.find_all(attrs={})

    Python爬虫【解析库之beautifulsoup】
    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(attrs={'id': 'list-1'}))
    print(soup.find_all(attrs={'name': 'elements'}))
    view code
    Python爬虫【解析库之beautifulsoup】
    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(id='list-1'))
    print(soup.find_all(class_='element'))
    第二种

    文本内容获取

    Python爬虫【解析库之beautifulsoup】
    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(text='foo'))
    view code

    find( name , attrs , recursive , text , **kwargs )

    find返回单个元素,find_all返回所有元素

    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find('ul'))
    print(type(soup.find('ul')))
    print(soup.find('page'))

    find_parents() find_parent() 

    find_parents()返回所有祖先节点,find_parent()返回直接父节点。

    find_next_siblings() find_next_sibling()

    find_next_siblings()返回后面所有兄弟节点,find_next_sibling()返回后面第一个兄弟节点。

    find_previous_siblings() find_previous_sibling()

    find_previous_siblings()返回前面所有兄弟节点,find_previous_sibling()返回前面第一个兄弟节点。

    find_all_next() find_next()

    find_all_next()返回节点后所有符合条件的节点, find_next()返回第一个符合条件的节点

    find_all_previous() 和 find_previous()

    find_all_previous()返回节点前所有符合条件的节点, find_previous()返回节点前第一个符合条件的节点

    css选择器

    通过select()直接传入css选择器即可完成选择 soup.select(css选择器)

    Python爬虫【解析库之beautifulsoup】
    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('.panel .panel-heading'))
    print(soup.select('ul li'))
    print(soup.select('#list-2 .element'))
    print(type(soup.select('ul')[0]))
    view code
    Python爬虫【解析库之beautifulsoup】
    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul.select('li'))
    view code

    获取属性 e.attrs[] or  e[]

    Python爬虫【解析库之beautifulsoup】
    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul['id'])
        print(ul.attrs['id'])
    view code

    获取内容 get_text()

    Python爬虫【解析库之beautifulsoup】
    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">foo</li>
                <li class="element">bar</li>
                <li class="element">jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">foo</li>
                <li class="element">bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.select('li'):
        print(li.get_text())
    view code

    总结

    • 推荐使用lxml解析库,必要时使用html.parser
    • 标签选择筛选功能弱但是速度快
    • 建议使用find()、find_all() 查询匹配单个结果或者多个结果
    • 如果对css选择器熟悉建议使用select()
    • 记住常用的获取属性和文本值的方法

    本文代码皆来自崔庆才《Python3网络爬虫开发实战》