python+requests+re匹配抓取猫眼上映电影信息
程序员文章站
2022-04-10 22:02:41
python+requests抓取猫眼中上映电影,re正则匹配获取对应电影的排名,图片地址,片名,主演及上映时间和评分 ......
使用 Python 的 requests 库抓取猫眼电影榜单页面，再用 re 正则表达式匹配出每部电影的排名、图片地址、片名、主演、上映时间和评分。
"""Scrape the Maoyan movie board and store each entry as a JSON line.

The page is fetched with ``requests``; rank, poster URL, title, cast,
release date and score are extracted with one regular expression and
appended to a text file, one JSON object per line.
"""
import json
import re

# Matches one <dd>...</dd> element and captures, in order: rank, image URL,
# title, cast text, release text, integer part and fractional part of the
# score. re.S lets "." cross newlines, which is required because a <dd>
# element spans several lines of HTML.
_ITEM_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot block the script forever.
    :return: the decoded HTML source of the page.
    :raises requests.RequestException: on connection errors or timeout.
    """
    # Imported here so the parsing and storage helpers stay usable even when
    # the third-party ``requests`` package is not installed.
    import requests

    # Browser identification sent with the request.
    # NOTE(review): the value is entirely lower-case, unlike the string a real
    # Chrome build sends -- confirm the site accepts it as-is.
    user_agent = (
        "mozilla/5.0 (windows nt 10.0; win64; x64) "
        "applewebkit/537.36 (khtml, like gecko) chrome/74.0.3729.169 safari/537.36"
    )
    headers = {"user-agent": user_agent}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


def _strip_label(text, label):
    """Return *text* trimmed, without a leading *label* and its colon."""
    text = text.strip()
    if text.startswith(label):
        # Drop the label, then the full-width or ASCII colon that follows it.
        text = text[len(label):].lstrip("：: ")
    return text


def parse_one_page(html):
    """Yield one dict per movie found in *html*.

    :param html: HTML source of a board page; empty or ``None`` yields nothing.
    :return: generator of dicts with the keys 排名 (rank), 图片地址 (image
        URL), 片名 (title), 主演 (cast), 上映时间 (release date) and
        分数 (score).
    """
    if not html:
        return
    for rank, image, title, star, release, integer, fraction in (
        _ITEM_PATTERN.findall(html)
    ):
        yield {
            "排名": rank,
            "图片地址": image,
            "片名": title,
            # Remove the field label by name rather than by a fixed slice
            # length, so the separator colon never leaks into the value.
            "主演": _strip_label(star, "主演"),
            "上映时间": _strip_label(release, "上映时间"),
            # The score is split across two tags, e.g. "9." and "5".
            "分数": integer + fraction,
        }


def write_file(content, path="result.txt"):
    """Append *content* to *path* as one JSON line.

    :param content: JSON-serialisable object, typically one movie dict.
    :param path: output file; created if missing, otherwise appended to.
    """
    with open(path, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps Chinese text readable in the output file.
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main():
    """Fetch the board page, print every movie and append it to the file."""
    url = "http://maoyan.com/board/4"
    html = get_html(url)
    for item in parse_one_page(html):
        print(item)
        write_file(item)


if __name__ == '__main__':
    main()
上一篇: netty框架概述