Using the Requests Library
Requests is the simplest and easiest-to-use HTTP library for Python.
GET requests
requests.get(url, params=None, **kwargs) returns a Response object. The params argument is a dict or byte sequence that is appended to the URL as the query string. The commonly used kwargs are listed below, followed by a short example:
- headers (dict): HTTP request headers
- cookies (dict | CookieJar): a dict or a CookieJar object
- timeout (seconds): connection timeout, in seconds
- proxies (dict): proxy settings
- verify (bool): whether to verify the TLS certificate, True by default. If an SSLError is raised, it can be set to False
- allow_redirects (bool): whether to follow redirects, True by default
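A brief sketch of a GET request using these parameters (the URL and query values are placeholders chosen for illustration):
import requests
# Placeholder query parameters; they are encoded into the URL
params = {'q': 'python', 'page': 1}
headers = {'user-agent': 'Mozilla/5.0'}
r = requests.get('https://httpbin.org/get', params=params, headers=headers, timeout=8)
print(r.url)          # the query string is appended to the URL
print(r.status_code)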
POST requests
requests.post(url, data=None, json=None, **kwargs) returns a Response object. Its parameters are listed below, with a short example after the list:
- data: a dict, list of tuples, bytes, or file-like object sent as the request body
- files: file objects, for multipart uploads
- json: a JSON-serializable object sent as the request body
- kwargs: same as above
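A brief sketch of POST requests with form data and with a JSON body (httpbin.org is used here only as a test endpoint):
import requests
# Send form-encoded data via the data parameter
r1 = requests.post('https://httpbin.org/post', data={'name': 'tom', 'age': 20})
print(r1.json()['form'])
# Send a JSON body via the json parameter
r2 = requests.post('https://httpbin.org/post', json={'name': 'tom', 'age': 20})
print(r2.json()['json'])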
Cookies and Sessions
HTTP itself is a stateless protocol. To support interaction between client and server, we need some mechanism to persist state across requests, and those mechanisms are cookies and sessions.
A cookie keeps state on the client side, while a session keeps state on the server side.
Example:
import requests
"""不在一个会话中"""
requests.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
res = requests.get("http://httpbin.org/cookies")
print(res.text)
"""会话中"""
s = requests.Session()
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
r = s.get("http://httpbin.org/cookies")
print(r.text)
Response
What are the commonly used attributes and methods of a Response object?
import requests
res = requests.get('http://www.baidu.com')
"""响应内容的字符串形式"""
print(res.text)
# >>> <class 'str'>
"""响应内容的二进制形式"""
print(res.content)
# >>> <class 'bytes'>
"""获取cookie"""
print(res.cookies)
# >>> <class 'requests.cookies.RequestsCookieJar'>
"""CookieJar对象转字典"""
print(res.cookies.get_dict())
# >>> {'BDORZ': '27315'}
"""获取状态码"""
print(res.status_code)
# >>> 200
"""获取请求url"""
print(res.url)
# >>> http://www.baidu.com/
"""获取请求头"""
print(res.request.headers)
"""获取响应头"""
print(res.headers)
"""响应的默认编码,由响应头分析出来的"""
print(res.encoding)
# >>> 如果headers中不存在charset字段,默认编码为ISO-8859-1
"""由响应内容分析出来的的编码"""
print(res.apparent_encoding)
# >>> utf-8
"""json解析,如果响应内容为json对象的话"""
# print(res.json())
Using the Requests library in practice
(1) Randomly generating a User-Agent to impersonate a browser
Example:
from fake_useragent import UserAgent
ua = UserAgent()
print(ua.random)
print(ua.chrome)
print(ua.ie)
print(ua.firefox)
print(ua.opera)
(2) Exception handling: a basic code skeleton
Example:
import requests
from fake_useragent import UserAgent
def crawler(url):
    headers = {'user-agent': UserAgent().chrome}
    try:
        r = requests.get(url, headers=headers, timeout=8)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return None

if __name__ == '__main__':
    url = 'https://www.baidu.com'
    html = crawler(url)
    if html:
        print(html)
    else:
        print('An exception occurred!')
(3) Retrying 3 times after a failed request
Example:
import requests
def crawler(url):
    n = 1
    while n <= 4:  # one initial attempt plus up to 3 retries
        try:
            r = requests.get(url, timeout=8)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            print('Attempt %d succeeded!' % n)
            return r.text
        except requests.RequestException:
            print('Attempt %d failed!' % n)
            n += 1

if __name__ == '__main__':
    url = 'https://www.123456789.com'
    if crawler(url):
        print('success!')
    else:
        print('failure!')
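Alternatively, here is a rough sketch of letting a Session handle retries automatically through urllib3's Retry and a transport adapter; the retry count and back-off values below are illustrative choices, not fixed recommendations:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 3 times on connection errors and common 5xx responses (illustrative settings)
retries = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
r = session.get('https://www.baidu.com', timeout=8)
print(r.status_code)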
(4) Proxy IPs
A few sites that offer proxy IPs:
- 66代理: http://www.66ip.cn/6.html
- 西刺代理: https://www.xicidaili.com/
- 快代理: https://www.kuaidaili.com/free/
To check whether a proxy IP is usable, send a request to https://httpbin.org/ip through it and inspect the reported origin, as the sketch below shows.
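A minimal sketch of such a check (the proxy address below is a hypothetical placeholder):
import requests
# Hypothetical proxy address; replace it with a proxy you actually want to test
proxies = {'http': 'http://123.163.122.5:9999', 'https': 'http://123.163.122.5:9999'}
try:
    r = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
    print(r.json()['origin'])  # should report the proxy's IP rather than your own
except requests.RequestException:
    print('This proxy is not usable')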
The following example scrapes the free high-anonymity proxies from 西刺 (xicidaili):
from gevent import monkey
monkey.patch_all()
from fake_useragent import UserAgent
import gevent
import requests
import re
import json
import os

def crawler(url):
    """Fetch a page"""
    headers = {'user-agent': UserAgent().chrome}
    try:
        r = requests.get(url, headers=headers, timeout=8)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return None

def parse(html):
    """Parse the page and extract the proxies"""
    # Matches the IP, the port, the "高匿" (high-anonymity) label, and the protocol
    pattern = r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*?(\d{2,5}).*?高匿.*?(HTTPS?)'
    tds = re.findall(pattern, html, re.S)
    for td in tds:
        k = 'http' if td[2] == 'HTTP' else 'https'
        v = '{}://{}:{}'.format(k, td[0], td[1])
        yield {k: v}

def check(ips):
    """Check whether each proxy is usable"""
    for ip in ips:
        try:
            r = requests.get('https://httpbin.org/ip', proxies=ip, timeout=3)
            r.raise_for_status()
            # '119.129' is the author's own IP prefix; if it is absent,
            # the request really went out through the proxy
            if '119.129' not in r.text:
                print(ip, 'is usable')
                with open('ip.txt', 'a') as f:
                    f.write(json.dumps(ip) + '\n')
        except Exception:
            pass

def run(url):
    """Fetch the page"""
    html = crawler(url)
    if html:
        """Parse the page and extract the proxies"""
        ips = parse(html)
        """Check whether the proxies are usable"""
        check(ips)

if __name__ == '__main__':
    """If the output file already exists, delete it"""
    if os.path.exists('ip.txt'):
        os.remove('ip.txt')
    """Pages to crawl"""
    urls = [
        'https://www.xicidaili.com/nn/', 'https://www.xicidaili.com/wn/',
        'https://www.xicidaili.com/wt/',
    ]
    url_list = [url + str(i + 1) for url in urls for i in range(3)]
    """Create the crawler tasks"""
    tasks = [gevent.spawn(run, url) for url in url_list]
    """Run the crawler tasks"""
    gevent.joinall(tasks)
Note that free proxy IPs are unstable and have a low survival rate.
An example of looking up the location of an IP address:
import requests
import re

def check_ip(proxies):
    """First check whether the proxy works, then look up the IP's location"""
    try:
        r = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=4)
        origin = r.json()["origin"].split(',')[0]
        url = 'http://www.ip138.com/ips138.asp?ip=' + origin
        headers = {'User-Agent': 'Mozilla/5.0'}
        res = requests.get(url, headers=headers)
        res.encoding = res.apparent_encoding
        result = re.findall(r'<li>本站数据:(.*?)</li>', res.text, re.S)[0]
        print(result)
    except Exception:
        print('Proxy connection failed! Please retry or switch to another IP')

if __name__ == '__main__':
    proxies = {'https': 'https://123.163.122.5:9999'}
    check_ip(proxies)
An example of using a proxy IP:
import requests
import json
import random

def read_ip():
    """Read the saved proxies from ip.txt"""
    with open('ip.txt', 'r') as f:
        lines = f.readlines()
    ip_list = []
    for line in lines:
        ip = line.strip()
        ip_list.append(json.loads(ip))
    return ip_list

def crawler(ips):
    """Pick a proxy, check it, and then use it"""
    while True:
        ip = random.choice(ips)  # pick a random proxy
        try:
            r = requests.get('https://httpbin.org/ip', proxies=ip, timeout=3)  # check the proxy
            r.raise_for_status()
            # '119.129' is the author's own IP prefix; if absent, the proxy works
            if '119.129' not in r.text:
                res = requests.get('https://httpbin.org/ip', proxies=ip, timeout=4)  # replace this URL with the page you actually want to fetch
                res.raise_for_status()
                res.encoding = res.apparent_encoding
                return res.text
        except requests.RequestException:
            pass

if __name__ == "__main__":
    """Read the proxies"""
    ips = read_ip()
    """Check a proxy and use it"""
    html = crawler(ips)
    if html:
        print(html)
The most complete and best reference is still the official documentation.