
Basic Python crawler project: scraping the first ten pages of Guba (reads, comments, title, author, last updated)


This is a crawler that scrapes the first ten pages of the Guba (East Money stock forum) listing.

import re, json
import requests


def write_to_json(infos):
    with open('movies.json', 'w', encoding='utf-8') as fp:
        json.dump(infos, fp)


# Parse the page content
def parse_page(html_str):
    infos = []

    # Check that the page content was actually fetched
    # print(html_str)
    # Principle for regex filtering: progressively narrow the matching scope.
    ul_p = re.compile(r'<ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">(.*?)</ul>', re.S)
    ul_content = ul_p.search(html_str).group()
    # Re-capture the list body (group 1) from the matched <ul> block
    cite_p = re.compile(r'<ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">(.*?)</ul>', re.S)
    cite_list = cite_p.findall(ul_content)

    '''
    Fields extracted from each <li>:
        reads
        comments
        title
        author
        last updated
        detail page URL
    '''

    for cite in cite_list:
        cite_q = re.compile(r'<li>(.*?)</li>', re.S)
        cite_list2 = cite_q.findall(cite)
        for cite2 in cite_list2:
            clk_p = re.compile(r'<cite>(.*?)</cite>', re.S)  # reads / comments
            clk = clk_p.findall(cite2)
            # read count
            read_count = clk[0].strip()
            print(read_count)
            # comment count
            comment = clk[1].strip()
            # title
            title_p = re.compile(r'.*?class="note">(.*?)</a>', re.S)
            title = title_p.search(cite2).group(1)
            aut_p = re.compile(r'.*?target="_blank"><font>(.*?)</font>')  # author
            aut = aut_p.search(cite2).group(1).strip()
            last_p = re.compile(r'<cite class="last">(.*?)</cite>')  # last updated
            last = last_p.search(cite2).group(1)
            url_p = re.compile(r'<a href="(.*?)"')  # detail page URL
            url = url_p.search(cite2).group(1)
            # Assemble the scraped record
            item = {}
            item['clk'] = read_count
            item['rev'] = comment
            item['sub'] = title
            item['aut'] = aut
            item['last'] = last
            item['url'] = url
            infos.append(item)

    return infos

def qingqiu():
    infos = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    }
    for i in range(1, 11):
        # Build the URL for each of the first ten pages
        base_url = f'http://guba.eastmoney.com/default,99_{i}.html'
        response = requests.get(base_url, headers=headers)
        infos.extend(parse_page(response.text))
    return infos

if __name__ == '__main__':
    # infos = qingqiu()        # scrape the data into infos
    # write_to_json(infos)     # write the data to movies.json

    # movies.json is created by the commented-out crawl above
    infos = json.load(open('movies.json', 'r', encoding='utf-8'))
    for item in infos:
        print(item)
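
For reference, each record appended by parse_page and written to movies.json has the following shape (the field values below are placeholders, not real scraped data):

    {
        "clk": "<read count>",
        "rev": "<comment count>",
        "sub": "<post title>",
        "aut": "<author>",
        "last": "<last updated>",
        "url": "<detail page link>"
    }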

==========================================================================
Optimized version of the crawler for the first ten pages of Guba

import requests, re, json

# Null-safe helper: return the requested group, or '' when the regex did not match
def get_match(match,number):
    if match:
        return match.group(number)
    return ''
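
# get_match is used below as, e.g., get_match(title_p.search(li), 1): it returns
# the captured group, or '' when the pattern finds nothing, so a missing field
# never raises AttributeError.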
def write_to_json(infos):
    with open('guba.json','w',encoding='utf-8') as fp:
        json.dump(infos,fp)

def parse_page(html_str):
    # print(html_str)
    ul_p = re.compile(r' <ul class="newlist"(.*?)</ul>',re.S)
    ul_content = get_match(ul_p.search(html_str),0)
    li_p = re.compile(r'<li>(.*?)</li>', re.S)
    li_contents = li_p.findall(ul_content)
    # print(li_contents)
    for li in li_contents:
        # Read count and comment count both live in <cite> tags
        cite_p = re.compile(r'<cite>(.*?)</cite>', re.S)
        cites = cite_p.findall(li)
        if len(cites) == 2:
            read_num = cites[0].strip()
            comment_num = cites[1].strip()
            title_p = re.compile(r'" title="(.*?)" class=', re.S)
            title = get_match(title_p.search(li), 1)
            # print(title)
            href_p = re.compile(r' <a href="(.*?)" title="', re.S)
            href = get_match(href_p.search(li), 1)
            # print(href)
            href = 'http://guba.eastmoney.com' + href
            item = {}
            item['read_num'] = read_num
            item['comment_num'] = comment_num
            item['title'] = title
            item['href'] = href
            print(item)
            # Append to the module-level infos list defined under __main__
            infos.append(item)

def main():
    base_url = 'http://guba.eastmoney.com/default,99_%s.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    for page in range(1, 11):  # first ten pages
        response = requests.get(base_url % page, headers=headers)
        parse_page(response.text)

if __name__ == '__main__':
    infos = []
    main()
    write_to_json(infos)
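
To inspect the output, guba.json can be read back the same way the first version reads movies.json; a minimal sketch, assuming the script above has already been run in the same directory:

    import json

    with open('guba.json', 'r', encoding='utf-8') as fp:
        for item in json.load(fp):
            print(item['read_num'], item['comment_num'], item['title'], item['href'])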
