
2021-07-18 Web Scraping in Practice: Scraping the Maoyan TOP100 Chart

"""
Maoyan chart scraper
A simple project: a requests-based scraper with paginated crawling.
Scrapes the TOP100 chart and writes the results to a file.
https://maoyan.com/board
Fields scraped: {rank, image, title, starring actors, release time, score}
Analysis:
Page 1:  https://maoyan.com/board/4?offset=0
Page 2:  https://maoyan.com/board/4?offset=10
...
Page 10: https://maoyan.com/board/4?offset=90
i.e. offset = (page - 1) * 10
Request method: GET
Parameter: offset
Return value: HTML source
"""
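To make the offset rule concrete, here is a minimal sketch (separate from the script below) that prints the ten URLs the crawl loop will request:

# offset = (page - 1) * 10 for pages 1 through 10
for page in range(1, 11):
    print("page", page, "->", "https://maoyan.com/board/4?offset=" + str((page - 1) * 10))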
import requests
from requests.exceptions import RequestException
import re, time, json
import random  # used to rotate User-Agent strings

def getPage(url):
    '''Fetch the page at the given URL and return its HTML, or None on failure.'''
    try:  # error handling
        # User-Agent strings let the server identify the client's OS and
        # browser version; picking one at random per request makes the
        # traffic look less uniform.
        agentsList = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
        ]
        # Assemble the request headers; the Cookie was copied from a real
        # browser session on maoyan.com.
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": random.choice(agentsList),
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Cookie": "__mta=156078980.1626568448129.1626573809055.1626573813982.17; uuid_n_v=v1; uuid=C83359D0E75F11EB82B96B700774E9B80C46EFC3C93C44F4BEF0D1A78535F054; _csrf=48b61658bb663dcdd5f31b91ca02b255aebb124b93d280b805de7cda35590ec8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1626568414; _lxsdk_cuid=17ab7085558c8-0708c9630dded1-3c604504-144000-17ab7085558c8; _lxsdk=C83359D0E75F11EB82B96B700774E9B80C46EFC3C93C44F4BEF0D1A78535F054; __utma=17099173.481871876.1626568512.1626568512.1626568512.1; __utmc=17099173; __utmz=17099173.1626568512.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1626573896; __mta=156078980.1626568448129.1626573813982.1626573896465.18; _lxsdk_s=17ab73be370-777-674-839%7C%7C36"
        }
        res = requests.get(url, headers=headers)

        # If a captcha page comes back, print the real (redirected) URL
        # print(res.url)
        if res.status_code == 200:
            res.encoding = "utf-8"
            # print(res.text)
            return res.text
        else:
            return None
    except RequestException:
        return None
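Maoyan often redirects suspected scrapers to a verification (captcha) page instead of returning an error status, which is why the commented-out print(res.url) above is useful. Below is a minimal sketch of an explicit check; the assumption that the verification page's URL contains "verify" is mine, not something the original article confirms:

# Hedged sketch: detect a captcha redirect on a requests Response.
def looksBlocked(res):
    '''Return True if the response looks like a captcha/verify redirect.'''
    # Assumption: Maoyan's verification pages live at a URL containing "verify".
    return len(res.history) > 0 and "verify" in res.url

getPage() could call this right after requests.get() and treat a blocked response like any other failure by returning None.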

def parsePage(html):
    '''Parse the scraped HTML and yield the extracted fields.'''
    # print(html)
    # ([0-9]+) captures one or more digits; the non-greedy .*? skips the
    # markup between the fields we care about (content we don't analyze).
    pat = ('<dd>.*?board-index.*?>([0-9]+)</i>'   # rank
           '.*?data-src="(.*?)"'                  # poster image URL
           '.*?name"><a.*?>(.*?)</a>'             # title
           '.*?star">(.*?)</p>'                   # starring actors
           '.*?releasetime">(.*?)</p>'            # release time
           '.*?integer">([0-9.]+)</i>'            # integer part of the score
           '.*?fraction">([0-9])</i>')            # fractional part of the score
    items = re.findall(pat, html, re.S)
    # print(items)
    for item in items:
        # print(item)
        # assemble a dict
        yield {
            "index": item[0],
            "image": item[1],
            "title": item[2],
            "actor": item[3].strip()[3:],  # strip whitespace, then drop the 3-char "主演:" prefix
            "time": item[4].strip()[5:],   # strip whitespace, then drop the 5-char "上映时间:" prefix
            "score": item[5] + item[6],    # e.g. "9." + "6" -> "9.6"
        }
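To see what one matched tuple looks like before it becomes a dict, here is a self-contained check against a hand-written <dd> fragment. The fragment is an illustrative reconstruction of the board's markup (element names inferred from the pattern above), not copied from Maoyan:

import re

# Illustrative <dd> fragment mimicking the assumed board markup.
sample = ('<dd><i class="board-index board-index-1">1</i>'
          '<img data-src="https://example.com/poster.jpg" alt="" />'
          '<p class="name"><a href="/films/0">哪吒之魔童降世</a></p>'
          '<p class="star">\n    主演:吕艳婷,囧森瑟夫,瀚墨\n</p>'
          '<p class="releasetime">上映时间:2019-07-26</p>'
          '<p class="score"><i class="integer">9.</i><i class="fraction">6</i></p></dd>')

pat = ('<dd>.*?board-index.*?>([0-9]+)</i>.*?data-src="(.*?)"'
       '.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>'
       '.*?releasetime">(.*?)</p>'
       '.*?integer">([0-9.]+)</i>.*?fraction">([0-9])</i>')
print(re.findall(pat, sample, re.S))
# -> [('1', 'https://example.com/poster.jpg', '哪吒之魔童降世',
#      '\n    主演:吕艳婷,囧森瑟夫,瀚墨\n', '上映时间:2019-07-26', '9.', '6')]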

def writeFile(content):
    '''Append one scraped record to the output file.'''
    # print(content)
    with open("./result.txt", "a", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")
        # json.dumps escapes non-ASCII characters by default; ensure_ascii=False
        # keeps the Chinese text human-readable in the file.
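Because each record is written as one JSON object per line (the JSON Lines convention), reading the file back is straightforward. A minimal sketch, assuming the default ./result.txt path:

import json

def readResults(path="./result.txt"):
    '''Read the JSON-lines output back into a list of dicts.'''
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]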

def main(offset):
    '''Scraper entry point (scheduler function).'''
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    print(url)
    html = getPage(url)
    if html:
        for item in parsePage(html):
            writeFile(item)

# program entry point
if __name__ == '__main__':
    # main(0)
    for i in range(10):
        print("Scraping page", i + 1, "...")
        main(offset=i * 10)
        time.sleep(1)  # throttle: roughly one page per second