A basic Python crawler project: scraping the first ten pages of Guba posts [reads] [comments] [title] [author] [update time]
This is a crawler that scrapes the first ten pages of data from Guba (Eastmoney's stock forum).
import re, json
import requests

# Save the scraped items to a JSON file.
def write_to_json(infos):
    with open('movies.json', 'w', encoding='utf-8') as fp:
        json.dump(infos, fp, ensure_ascii=False)

# Parse one page of HTML.
def parse_page(html_str):
    # Check that the page content actually came back:
    # print(html_str)
    # Guiding principle for regex scraping: narrow the match scope step by step.
    ul_p = re.compile(r'<ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">(.*?)</ul>', re.S)
    ul_content = ul_p.search(html_str).group(1)
    '''
    Fields to extract per post:
    reads, comments, title, author, update time, detail-page URL
    '''
    items = []
    li_p = re.compile(r'<li>(.*?)</li>', re.S)
    for li in li_p.findall(ul_content):
        clk_p = re.compile(r'<cite>(.*?)</cite>', re.S)  # read and comment counts
        clk = clk_p.findall(li)
        read_count = clk[0].strip()  # reads
        comment = clk[1].strip()     # comments
        title_p = re.compile(r'.*?class="note">(.*?)</a>', re.S)  # title
        title = title_p.search(li).group(1)
        aut_p = re.compile(r'.*?target="_blank"><font>(.*?)</font>')  # author
        aut = aut_p.search(li).group(1).strip()
        last_p = re.compile(r'<cite class="last">(.*?)</cite>')  # update time
        last = last_p.search(li).group(1)
        url_p = re.compile(r'<a href="(.*?)"')  # detail-page URL
        url = url_p.search(li).group(1)
        # Collect the scraped fields
        item = {}
        item['clk'] = read_count
        item['rev'] = comment
        item['sub'] = title
        item['aut'] = aut
        item['last'] = last
        item['url'] = url
        items.append(item)
    return items

# Request the first ten pages and accumulate the results.
def qingqiu():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    }
    infos = []
    for i in range(1, 11):
        # Build the page URL
        base_url = f'http://guba.eastmoney.com/default,99_{i}.html'
        response = requests.get(base_url, headers=headers)
        # The original returned inside the loop, which stopped after page 1;
        # extend the list instead so all ten pages are collected.
        infos.extend(parse_page(response.text))
    return infos

if __name__ == '__main__':
    infos = qingqiu()     # extract the data into infos
    write_to_json(infos)  # write the data to a file (the script creates movies.json itself)
    # Read the file back and print each item:
    with open('movies.json', 'r', encoding='utf-8') as fp:
        infos = json.load(fp)
    for item in infos:
        print(item)
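The comment above about narrowing the match scope deserves a standalone illustration. Here is a minimal sketch (the HTML snippet is made up for demonstration): isolate the enclosing container first, then run the field regexes only on that substring, so stray matches elsewhere on the page cannot leak in.

import re

# Hypothetical snippet standing in for a full page; only the <ul> matters.
html = '<div><a href="/stray">noise</a></div><ul class="newlist"><li><a href="/news,1.html">post one</a></li></ul>'

# Step 1: shrink the search space to the target container.
ul = re.search(r'<ul class="newlist">(.*?)</ul>', html, re.S).group(1)

# Step 2: the field regex now only sees the container's content.
links = re.findall(r'<a href="(.*?)">', ul)
print(links)  # ['/news,1.html'] -- the stray link outside the <ul> is excluded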
==========================================================================
An optimized version of the scraper for the first ten pages of Guba data
import requests, re, json

# Guard against a failed match: return the captured group, or '' if there was no match.
def get_match(match, number):
    if match:
        return match.group(number)
    return ''

def write_to_json(infos):
    with open('guba.json', 'w', encoding='utf-8') as fp:
        json.dump(infos, fp, ensure_ascii=False)

def parse_page(html_str):
    # print(html_str)
    ul_p = re.compile(r' <ul class="newlist"(.*?)</ul>', re.S)
    ul_content = get_match(ul_p.search(html_str), 0)
    li_p = re.compile(r'<li>(.*?)</li>', re.S)
    li_contents = li_p.findall(ul_content)
    # print(li_contents)
    for li in li_contents:
        # Read and comment counts. (The original also named this list `infos`,
        # shadowing the global result list, so items were never collected.)
        counts_p = re.compile(r'<cite>(.*?)</cite>', re.S)
        counts = counts_p.findall(li)
        if len(counts) == 2:
            read_num = counts[0].strip()
            comment_num = counts[1].strip()
            title_p = re.compile(r'" title="(.*?)" class=', re.S)
            title = get_match(title_p.search(li), 1)
            # print(title)
            href_p = re.compile(r' <a href="(.*?)" title="', re.S)
            href = get_match(href_p.search(li), 1)
            # print(href)
            href = 'http://guba.eastmoney.com' + href
            item = {}
            item['read_num'] = read_num
            item['comment_num'] = comment_num
            item['title'] = title
            item['href'] = href
            print(item)
            infos.append(item)  # append to the global list created in __main__

def main():
    base_url = 'http://guba.eastmoney.com/default,99_%s.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    for page in range(1, 11):  # first ten pages, matching the title
        response = requests.get(base_url % page, headers=headers)
        parse_page(response.text)

if __name__ == '__main__':
    infos = []
    main()
    write_to_json(infos)
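To verify a run, the saved file can be read back the same way the first version does with movies.json. A minimal sketch, assuming guba.json was produced by the run above:

import json

# Load the items written by write_to_json and preview a few.
with open('guba.json', 'r', encoding='utf-8') as fp:
    infos = json.load(fp)
print(len(infos), 'posts scraped')
for item in infos[:3]:
    print(item['read_num'], item['comment_num'], item['title'], item['href'])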