Python Crawlers with Scrapy: Simulated Login
This post shows how to log in to a site with Scrapy, using two approaches: a GET request that carries session cookies (Renren as the example) and a POST request that submits form data (GitHub as the example).
1 Simulated login to Renren
import scrapy


class RenrenSpider(scrapy.Spider):
    name = 'renren'
    allowed_domains = ['renren.com']
    start_urls = ['http://www.renren.com/974676254/profile']

    # Override start_requests(self) so the very first request carries our cookies
    def start_requests(self):
        # Cookies copied from a logged-in browser session; Scrapy expects them as a dict
        cookies = 'anonymid=kc1mze9joexboe; depovince=GW; _r01_=1; JSESSIONID=abc6wqeg9OcoteTZwoemx; ick_login=52775e71-3e24-4adf-8aff-3dbf92393776; taihe_bi_sdk_uid=c5ab893b13e43548f001c993d2154595; taihe_bi_sdk_session=433c3341b51a0ba03c62dccdaac91f19; loginfrom=null; jebe_key=5337f110-24a5-48d0-9393-8a1a4cc620c5%7Cbd5a1aba8e897ca35d9faabeb72dc675%7C1593503968723%7C1%7C1593503971085; jebe_key=5337f110-24a5-48d0-9393-8a1a4cc620c5%7Cbd5a1aba8e897ca35d9faabeb72dc675%7C1593503968723%7C1%7C1593503971087; wp_fold=0; t=3cd1a03529c28c3bcfb6373ce02efc554; societyguester=3cd1a03529c28c3bcfb6373ce02efc554; id=974676254; xnsid=faa094f8; jebecookies=0b991cb3-e2a3-449f-80f8-ba784b1b2a6c|||||; ver=7.0'
        # Split on the first '=' only, since cookie values may themselves contain '='
        cookies = {i.split('=', 1)[0]: i.split('=', 1)[1] for i in cookies.split('; ')}
        # Send the request
        yield scrapy.Request(
            url=self.start_urls[0],
            # Callback that handles the response
            callback=self.parse,
            # Attach the cookies
            cookies=cookies,
        )

    def parse(self, response):
        # print(response.body.decode())
        # Save the page so we can verify the login worked
        with open('renren.html', 'w', encoding='utf-8') as f:
            f.write(response.body.decode())
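The dict comprehension is the key step here: a browser's Cookie header is one long string, while Scrapy's cookies argument wants a dict. As a minimal standalone sketch of that conversion (the cookie names and values below are made up for illustration):

def cookie_str_to_dict(cookie_str):
    """Turn a raw 'Cookie:' header string into the dict Scrapy expects."""
    cookies = {}
    for pair in cookie_str.split('; '):
        # partition splits on the first '=' only, so values containing '=' survive intact
        name, _, value = pair.partition('=')
        cookies[name] = value
    return cookies


# Hypothetical values, not a real session:
print(cookie_str_to_dict('sessionid=abc123; token=xy==z'))
# -> {'sessionid': 'abc123', 'token': 'xy==z'}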
2 Simulated login to GitHub
import scrapy

'''
Form fields captured from GitHub's login page (https://github.com/login):
commit: Sign in
authenticity_token: ay/QHPxeCTKwPlks4/0QoVvp2CttEF5NRJ/mimgV7xv7N+d1ONDn5IRbtNxCoG1JpdCASZ8Sw669MMmNm9GwFg==
ga_id: 287622012.1592305586
login: LogicJerry
password: 123456
webauthn-support: supported
webauthn-iuvpaa-support: supported
return_to:
required_field_2fbe:
timestamp: 1593524450720
timestamp_secret: 02ae98af552a04d667ca9ae3afb11bbb763332685c2b8cf12daeef6f9f26b22f
'''


class GithubSpider(scrapy.Spider):
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    # Extract the form data (formdata) from the login page
    def parse(self, response):
        commit = 'Sign in'
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        # ga_id = response.xpath("//input[@name='ga_id']/@value").extract_first()
        login = 'LogicJerry'
        password = '12122121zxl'
        timestamp = response.xpath("//input[@name='timestamp']/@value").extract_first()
        timestamp_secret = response.xpath("//input[@name='timestamp_secret']/@value").extract_first()
        # print(authenticity_token)
        # Collect the form data (formdata) into a dict for submission
        data = {
            'commit': commit,
            'authenticity_token': authenticity_token,
            # 'ga_id': ga_id,
            'login': login,
            'password': password,
            'webauthn-support': 'supported',
            'webauthn-iuvpaa-support': 'unsupported',
            'timestamp': timestamp,
            'timestamp_secret': timestamp_secret,
        }
        yield scrapy.FormRequest(
            # URL the form is posted to
            url='https://github.com/session',
            # The form data to submit
            formdata=data,
            # Callback that handles the response
            callback=self.after_login,
        )

    def after_login(self, response):
        # Save the page
        with open('github.html', 'w', encoding='utf-8') as f:
            f.write(response.body.decode())
        print(response)
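To sanity-check that the POST actually logged us in, after_login can look for the account name in the page we land on. This is only a sketch: the exact text GitHub renders for a logged-in user is an assumption here, so adjust the marker string to whatever appears on your own dashboard:

    def after_login(self, response):
        # Assumed marker: a logged-in page normally contains the account's login name
        if 'LogicJerry' in response.text:
            self.logger.info('Login appears to have succeeded')
        else:
            self.logger.warning('Login may have failed; re-check the hidden token fields')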
There is also a more concise way to submit the POST data: let scrapy.FormRequest.from_response build the form submission for you.
import scrapy


class Github2Spider(scrapy.Spider):
    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # Note this uses scrapy.FormRequest.from_response; the first approach used scrapy.FormRequest
        yield scrapy.FormRequest.from_response(
            # The response containing the login form
            response=response,
            # Unlike the first approach, where we assembled every field of data by hand,
            # from_response reads the form's hidden fields from the page and we only
            # supply the fields we want to override
            formdata={'login': 'LogicJerry', 'password': '12122121zxl'},
            # Callback
            callback=self.after_login,
        )

    def after_login(self, response):
        # Save the page
        with open('github2.html', 'w', encoding='utf-8') as f:
            f.write(response.body.decode())
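To try any of these spiders, run them with the scrapy CLI from the project root. Two settings are usually worth checking first; the values below are a sketch of what I would expect to need (github.com's robots.txt may block the login endpoints, and sites often reject Scrapy's default User-Agent):

# settings.py (relevant lines only)
ROBOTSTXT_OBEY = False  # otherwise robots.txt rules can block the login requests
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # a browser-like UA

# then, from the project directory:
#   scrapy crawl github2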
Original article: https://blog.csdn.net/Claire_chen_jia/article/details/107068402