Python解析html的几种操作方式

程序员文章站 2022-03-13 11:52:05

解析html是爬虫抓取之后处理数据的一个重要环节。以下记录解析html的几种方式。先介绍基础的辅助函数，主要用于获取html并输出解析后的结果 #把传递解析函数，便于下面的...

解析html是爬虫抓取之后处理数据的一个重要环节。以下记录解析html的几种方式。
先介绍基础的辅助函数，主要用于获取html并输出解析后的结果。

# The parser is passed in as an argument so it can be swapped out below.
def get_html(url, paraser=bs4_paraser):
    """Download ``url`` and return ``paraser`` applied to the response body.

    :param url: address of the page to fetch.
    :param paraser: callable that receives the (decompressed) page body and
        returns the parsed result; defaults to ``bs4_paraser``.
    :return: whatever ``paraser`` returns, or ``None`` when the response
        status is not 200.
    """
    headers = {
        'Accept': '*/*',
        # Only gzip is advertised because it is the only encoding decoded below.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        # No hard-coded 'Host' entry: urllib2 fills it in from ``url``, so the
        # function also works for pages on other hosts.
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        if response.code != 200:
            return None
        body = response.read()
        content_encoding = response.info().get('Content-Encoding') or ''
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    # A server may ignore Accept-Encoding and answer uncompressed, so only
    # gunzip when the response declares itself gzip-compressed.
    if 'gzip' in content_encoding.lower():
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    return paraser(body)


# Fetch the sample review page with the lxml-based parser and print each review.
value = get_html('https://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response; fall back to an empty list
# instead of failing on the iteration.
for row in value or []:
    print(row)

1，使用lxml.html的方式进行解析。
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ.
[官网](https://lxml.de/)

def lxml_parser(page):
    """Extract the reviews from ``page`` using lxml XPath queries.

    Returns a list with one dict per review item, each holding the keys
    ``title``, ``title_href``, ``score``, ``time`` and ``people``.
    """
    doc = etree.HTML(page)
    reviews = []
    for wrapper in doc.xpath('//p[@class="yingping-list-wrap"]'):
        # Every "item" node below the wrapper is one review.
        for item in wrapper.xpath('.//p[@class="item"]'):
            # Title block of the review; all fields are read relative to it.
            header = item.xpath('.//p[@class="g-clear title-wrap"][1]')[0]
            review = {}
            review['title'] = header.xpath('./a/text()')[0]
            review['title_href'] = header.xpath('./a/@href')[0]
            # Score: first number found in the style attribute, divided by 20.
            style = header.xpath('./p/span/span/@style')[0]
            review['score'] = int(re.search(r'\d+', style).group()) / 20
            # Time
            review['time'] = header.xpath('./p/span[@class="time"]/text()')[0]
            # How many people liked the review
            likes_text = header.xpath('./p[@class="num"]/span/text()')[0]
            review['people'] = int(re.search(r'\d+', likes_text).group())
            reviews.append(review)
    return reviews

2，使用BeautifulSoup，不多说了，推荐一篇讲解非常好的文章
[应用讲解](https://www.bkjia.com/Pythonjc/992499.html)

def bs4_paraser(html):
    """Extract the reviews from ``html`` using BeautifulSoup.

    Returns a list with one dict per review item, each holding the keys
    ``title``, ``title_href``, ``score``, ``time`` and ``people``. An item
    that has no title block contributes an empty dict.
    """
    soup = BeautifulSoup(html, 'html.parser')
    reviews = []
    # Only the first review list on the page is considered (limit=1).
    for wrapper in soup.find_all('p', attrs={'class': 'yingping-list-wrap'}, limit=1):
        # Every "item" node below the wrapper is one review.
        for item in wrapper.find_all('p', attrs={'class': 'item'}):
            review = {}
            # Title block of the review; all fields are read relative to it.
            headers = item.find_all('p', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if headers:
                header = headers[0]
                review['title'] = header.a.string
                review['title_href'] = header.a['href']
                # Score: first number found in the style attribute, divided by 20.
                style = header.p.span.span['style']
                review['score'] = int(re.search(r'\d+', style).group()) / 20
                # Time
                review['time'] = header.p.find_all('span', attrs={'class': 'time'})[0].string
                # How many people liked the review
                likes_text = header.find_all('p', attrs={'class': 'num'})[0].span.string
                review['people'] = int(re.search(r'\d+', likes_text).group())
            reviews.append(review)
    return reviews

3，使用SGMLParser，主要是通过start、end tag的方式进行解析，解析过程比较明朗，但是有点麻烦，而且该案例的场景不太适合该方法，（哈哈）

class CommentParaser(SGMLParser):
    """Event-driven review extractor built on ``SGMLParser``.

    The parser tracks which of the nested ``<p>`` blocks it is currently
    inside and collects one dict per review item into ``self.data``. Each
    dict may hold the keys ``href``, ``title``, ``score``, ``time`` and
    ``people``; a key is simply absent when the page does not provide it.

    NOTE: the link is stored under ``href`` here, while ``lxml_parser`` and
    ``bs4_paraser`` use ``title_href``.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Flags recording which enclosing <p> blocks are currently open.
        self.__start_p_yingping = False
        self.__start_p_item = False
        self.__start_p_gclear = False
        self.__start_p_ratingwrap = False
        self.__start_p_num = False
        # True while inside the title <a>.
        self.__start_a = False
        # Span state: 0 = none, 1 = inside the "rating" span, 2 = inside the
        # "time" span, 3 = inside the score span nested in "rating",
        # 4 = inside a span of the "num" block.
        self.__span_state = 0
        # Review currently being assembled and the list of finished reviews.
        self.__value = {}
        self.data = []

    def __in_title(self):
        """Return True while inside list wrapper > item > title block."""
        return self.__start_p_yingping and self.__start_p_item and self.__start_p_gclear

    def start_p(self, attrs):
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_p_yingping = True
            elif v == 'item':
                self.__start_p_item = True
            elif v == 'g-clear title-wrap':
                self.__start_p_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_p_ratingwrap = True
            elif v == 'num':
                self.__start_p_num = True

    def end_p(self):
        # Close the innermost block that is still flagged as open.
        if not self.__start_p_yingping:
            return
        if not self.__start_p_item:
            self.__start_p_yingping = False
        elif not self.__start_p_gclear:
            # End of a review item: store it and start a fresh one.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_p_item = False
        elif self.__start_p_num or self.__start_p_ratingwrap:
            self.__start_p_num = False
            self.__start_p_ratingwrap = False
        else:
            self.__start_p_gclear = False

    def start_a(self, attrs):
        if self.__in_title():
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        if self.__in_title() and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        if not self.__in_title():
            return
        if self.__start_p_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested in the "rating" span: its style carries the
                # score. A missing style or one without digits is skipped
                # instead of raising.
                for k, v in attrs:
                    if k == 'style':
                        match = re.search(r'\d+', v or '')
                        if match:
                            self.__value['score'] = int(match.group()) / 20
                self.__span_state = 3
        elif self.__start_p_num:
            self.__span_state = 4

    def end_span(self):
        self.__span_state = 0

    def handle_data(self, data):
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without any digit (e.g. whitespace) is ignored rather
            # than crashing on a failed match.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())
def sgl_parser(html):
    """Parse ``html`` with ``CommentParaser`` and return the collected reviews."""
    parser = CommentParaser()
    parser.feed(html)
    # feed() may hold back trailing input; close() forces all buffered data
    # to be processed before the result is read.
    parser.close()
    return parser.data

4，HTMLParser，与3原理相似，只是调用的方法不太一样，基本上可以共用。

class CommentHTMLParser(HTMLParser.HTMLParser):
    """Event-driven review extractor built on ``HTMLParser``.

    The parser tracks which of the nested ``<p>`` blocks it is currently
    inside and collects one dict per review item into ``self.data``. Each
    dict may hold the keys ``href``, ``title``, ``score``, ``time`` and
    ``people``; a key is simply absent when the page does not provide it.

    NOTE: the link is stored under ``href`` here, while ``lxml_parser`` and
    ``bs4_paraser`` use ``title_href``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Flags recording which enclosing <p> blocks are currently open.
        self.__start_p_yingping = False
        self.__start_p_item = False
        self.__start_p_gclear = False
        self.__start_p_ratingwrap = False
        self.__start_p_num = False
        # True while inside the title <a>.
        self.__start_a = False
        # Span state: 0 = none, 1 = inside the "rating" span, 2 = inside the
        # "time" span, 3 = inside the score span nested in "rating",
        # 4 = inside a span of the "num" block.
        self.__span_state = 0
        # Review currently being assembled and the list of finished reviews.
        self.__value = {}
        self.data = []

    def __in_title(self):
        """Return True while inside list wrapper > item > title block."""
        return self.__start_p_yingping and self.__start_p_item and self.__start_p_gclear

    def __open_p(self, attrs):
        """Flag the <p> block identified by its class attribute as open."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_p_yingping = True
            elif v == 'item':
                self.__start_p_item = True
            elif v == 'g-clear title-wrap':
                self.__start_p_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_p_ratingwrap = True
            elif v == 'num':
                self.__start_p_num = True

    def __close_p(self):
        """Close the innermost <p> block that is still flagged as open."""
        if not self.__start_p_yingping:
            return
        if not self.__start_p_item:
            self.__start_p_yingping = False
        elif not self.__start_p_gclear:
            # End of a review item: store it and start a fresh one.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_p_item = False
        elif self.__start_p_num or self.__start_p_ratingwrap:
            self.__start_p_num = False
            self.__start_p_ratingwrap = False
        else:
            self.__start_p_gclear = False

    def __open_span(self, attrs):
        """Update the span state for a <span> opened inside a title block."""
        if self.__start_p_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested in the "rating" span: its style carries the
                # score. A missing/valueless style or one without digits is
                # skipped instead of raising.
                for k, v in attrs:
                    if k == 'style':
                        match = re.search(r'\d+', v or '')
                        if match:
                            self.__value['score'] = int(match.group()) / 20
                self.__span_state = 3
        elif self.__start_p_num:
            self.__span_state = 4

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.__open_p(attrs)
        elif tag == 'a':
            if self.__in_title():
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        self.__value['href'] = v
        elif tag == 'span':
            if self.__in_title():
                self.__open_span(attrs)

    def handle_endtag(self, tag):
        if tag == 'p':
            self.__close_p()
        elif tag == 'a':
            if self.__in_title() and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without any digit (e.g. whitespace) is ignored rather
            # than crashing on a failed match.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())
def html_parser(html):
    """Parse ``html`` with ``CommentHTMLParser`` and return the collected reviews."""
    parser = CommentHTMLParser()
    parser.feed(html)
    # feed() may hold back trailing input; close() forces all buffered data
    # to be processed before the result is read.
    parser.close()
    return parser.data

3、4对于该案例来说确实不太适合，趁现在有空记录下来，供学习使用！

上一篇： web前端性能优化的方法教程

下一篇：美最大运营商Verizon加入支持三星支付

Python解析html的几种操作方式

Python读写txt文本文件的操作方法全解析

python线程的几种创建方式详解

深入解析Python中的集合类型操作符

解析HTML5的存储功能和web SQL的相关操作方法

Python通过DOM和SAX方式解析XML的应用实例分享

Python中最常用的操作列表的几种方法归纳

对Python3 解析html的几种操作方式小结

详解前端HTML5几种存储方式的总结

Python 常用的几种安装Module方式

Python下利用BeautifulSoup解析HTML的实现