# 2021-07-18 爬虫实战: 爬取猫眼榜单TOP100 (Maoyan TOP100 crawler practice)
"""Maoyan TOP100 board crawler.

A simple requests-based crawler with pagination that scrapes the TOP100
movie board and appends each record to a file.

Target: https://maoyan.com/board
Fields scraped per movie: rank, poster image, title, lead actors,
release time, score.

Pagination analysis:
    page 1:  https://maoyan.com/board/4?offset=0
    page 2:  https://maoyan.com/board/4?offset=10
    ...
    page 10: https://maoyan.com/board/4?offset=90
    i.e. offset = (page - 1) * 10

Request method: GET
Parameter:      offset
Response:       HTML source of one board page
"""
import requests
from requests.exceptions import RequestException
import re,time,json
def getPage(url):
    """Fetch *url* and return its HTML text, or None on any failure.

    Sends a GET request with browser-like headers (User-Agent plus a
    session Cookie) so Maoyan's anti-crawler checks are less likely to
    reject the request.  Returns None on a non-200 status or on any
    requests-level exception.
    """
    try:  # any network-level problem degrades to a None result
        headers = {
            # NOTE: the blog paste had eaten the asterisks here; the
            # conventional value is "*/*".
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            # Session cookie copied from a real browser session; Maoyan
            # may serve a captcha page without it.
            "Cookie": "__mta=156078980.1626568448129.1626573809055.1626573813982.17; uuid_n_v=v1; uuid=C83359D0E75F11EB82B96B700774E9B80C46EFC3C93C44F4BEF0D1A78535F054; _csrf=48b61658bb663dcdd5f31b91ca02b255aebb124b93d280b805de7cda35590ec8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1626568414; _lxsdk_cuid=17ab7085558c8-0708c9630dded1-3c604504-144000-17ab7085558c8; _lxsdk=C83359D0E75F11EB82B96B700774E9B80C46EFC3C93C44F4BEF0D1A78535F054; __utma=17099173.481871876.1626568512.1626568512.1626568512.1; __utmc=17099173; __utmz=17099173.1626568512.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1626573896; __mta=156078980.1626568448129.1626573813982.1626573896465.18; _lxsdk_s=17ab73be370-777-674-839%7C%7C36",
        }
        res = requests.get(url, headers=headers)
        # If a captcha/redirect page comes back, res.url reveals the
        # real address the server sent us to.
        # print(res.url)
        if res.status_code == 200:
            res.encoding = "utf-8"
            return res.text
        return None
    except RequestException:
        return None
def parsePage(html):
    """Parse one board page's HTML and yield a dict per movie.

    The original regex literal was destroyed by the blog paste; it has
    been reconstructed from the Maoyan board markup, consistent with the
    seven capture groups this function consumes.  Each movie lives in a
    <dd> element; the groups are, in order: rank, poster image URL,
    title, actors line, release-time line, and the integer/fractional
    parts of the score.
    """
    pat = ('<dd>.*?board-index.*?>([0-9]+)</i>'   # rank: one or more digits
           '.*?data-src="(.*?)"'                  # poster image URL
           '.*?name.*?><a.*?>(.*?)</a>'           # movie title
           '.*?star.*?>(.*?)</p>'                 # actors line
           '.*?releasetime.*?>(.*?)</p>'          # release-time line
           '.*?integer.*?>([0-9.]+)</i>'          # score, integer part
           '.*?fraction.*?>([0-9])</i>.*?</dd>')  # score, fraction part
    # re.S lets '.' cross newlines, since each <dd> spans several lines.
    items = re.findall(pat, html, re.S)
    for item in items:
        yield {
            "index": item[0],
            "image": item[1],
            "title": item[2],
            # e.g. "\n 主演:张国荣\n" -> strip whitespace, then drop the
            # 3-character "主演:" prefix.
            "actor": item[3].strip()[3:],
            # drop the 5-character "上映时间:" prefix after stripping.
            "time": item[4].strip()[5:],
            # score is split in the markup: "9." + "5" -> "9.5".
            "score": item[5] + item[6],
        }
def writeFile(content):
    """Append *content* as one JSON line to ./result.txt.

    json.dumps would escape Chinese characters to \\uXXXX by default;
    ensure_ascii=False keeps them human-readable in the output file.
    """
    with open("./result.txt", "a", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")
def main(offset):
    """Crawl one board page at *offset* and persist every parsed movie.

    Acts as the scheduler: builds the page URL, fetches it, parses the
    movie records, and writes each one to the result file.  Silently
    skips the page if the fetch failed (getPage returned None).
    """
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    print(url)
    html = getPage(url)
    if html:
        for item in parsePage(html):
            writeFile(item)
# Script entry point: crawl all 10 pages (offsets 0, 10, ..., 90).
# NOTE: the paste had stripped the dunders from "__name__"/"__main__".
if __name__ == '__main__':
    for i in range(10):
        print("爬取第", i + 1, "页信息。。。")
        main(offset=i * 10)
        time.sleep(1)  # throttle: at most one page per second
# 上一篇: 关于python的多线程问题
# 下一篇: 爬取猫眼电影榜单