Python爬虫-requests库
程序员文章站
2022-07-14 11:19:08
...
记录一下学习过的爬虫知识,方便以后回顾查找~
import requests
import re
一、Get请求
# Issue a plain GET request and inspect the Response object it returns.
response = requests.get('http://baidu.com/')  # fetch the Baidu homepage
print(type(response))        # the Response class
print(response.status_code)  # HTTP status code of the reply
print(type(response.text))   # the decoded body is a str
print(response.text)         # body content
print(response.cookies)      # cookies the server set
1.1 添加params参数
# Pass query-string parameters via ``params``: requests URL-encodes them and
# appends them itself, so the URL needs no trailing '?' (the original had one).
data = {
    'name': 'germey',
    'age': 22,
}
# The request URL becomes http://httpbin.org/get?name=germey&age=22
r = requests.get('http://httpbin.org/get', params=data)
print(r.text)
- json 格式转化为字典
# The body is JSON text; Response.json() parses it into a Python dict.
resp = requests.get('http://httpbin.org/get')
print(type(resp.text))   # str — the raw JSON text
print("\n")
print(resp.json())       # parsed into a dict
print(type(resp.json()))
1.2 抓取网页
# Fetch a page with a browser User-Agent and extract matches with a regex.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36'
}
r = requests.get('https://www.zhihu.com/explore', headers=headers)
pattern = re.compile('explore-feed.*')  # compiled once; matches feed markup
title = re.findall(pattern, r.text)
# A bare expression only echoes its value in a REPL/notebook; in a script it
# is a no-op, so print the result explicitly.
print(title)
1.3 获取二进制数据
# Binary responses: .text tries to decode the bytes (garbling binary data),
# while .content gives the raw bytes.
icon = requests.get('https://github.com/favicon.ico')
print(icon.text)     # decoded as text — unreadable for an .ico file
print(icon.content)  # raw bytes of the icon
r.text 会按文本编码解码二进制内容,结果是乱码、显示不了;二进制数据应使用 r.content 获取原始字节
- 获取Github图标
# Download the GitHub favicon and save the raw bytes to a local file.
resp = requests.get('https://github.com/favicon.ico')
with open('favicon.ico', 'wb') as icon_file:
    icon_file.write(resp.content)  # .content is bytes, hence mode 'wb'
本地文件夹出现github图标照片
二、post请求
# POST form data: ``data`` is sent in the request body (httpbin echoes it
# under "form"), unlike ``params`` which goes into the URL query string.
data = {'name': 'germey', 'age': 22}
r = requests.post('http://httpbin.org/post', data=data)
print(r.text)
三、响应
# Inspect the common attributes of a Response.
# NOTE: the original User-Agent was malformed — it was missing the space
# before "(KHTML", which some servers reject as an invalid UA.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36'}
r = requests.get('http://www.jianshu.com', headers=headers)
print(type(r.status_code), r.status_code)  # int status code
print(type(r.headers), r.headers)          # case-insensitive header mapping
print(type(r.cookies), r.cookies)          # cookie jar set by the server
print(type(r.url), r.url)                  # final URL (after any redirects)
print(type(r.history), r.history)          # list of redirect responses
四、高级用法
4.1 cookie
# The cookie jar is iterable: .items() yields (name, value) pairs.
resp = requests.get("https://www.baidu.com")
print(resp.cookies)
print(resp.cookies.items())
for name, val in resp.cookies.items():
    print(f"{name}={val}")
- 获取知乎用自己的cookie
# Access zhihu.com as a logged-in user by replaying a browser's Cookie header.
# NOTE(review): these cookie values are session-bound and expire — replace
# them with fresh ones from your own browser when reproducing.
zhihu_headers = {
    'Cookie': '_zap=698024e1-91ad-4ef1-b497-8faa3456af84; _xsrf=Nk1tdLqXQKJcprow5wl84Uf9hF8eRJQs; d_c0="AOAvjwIetA-PTgv6gt-7-7DXekoQvE6Llbs=|1562565500"; __utmz=51854390.1578618753.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.110--|2=registration_date=20160502=1^3=entry_date=20160502=1; _ga=GA1.2.1987250188.1578618753; capsion_ticket="2|1:0|10:1583373076|14:capsion_ticket|44:NDBmNGYxNjJiYzFiNGNjNDljZmU5ZDNlN2M2M2E0OGI=|ef52da7f694e2d959b574ccf8770642b11a2c3cc15a995c4900eb6f90d85060a"; z_c0="2|1:0|10:1583373097|4:z_c0|92:Mi4xYVVqM0FnQUFBQUFBNEMtUEFoNjBEeWNBQUFDRUFsVk5LZWlIWGdEQl9VaWItMWstSm1VRG9ZejV0Q3F4cjBJVDln|ce6d3024eb77f9fb8e03cf68d8f2094ce84f905569bdabcc365892fac8ef0319"; tst=r; _gid=GA1.2.143501728.1584538440; q_c1=886762f2bae04daca68b9ae2af9a4e38|1584538854000|1564363133000; __utma=51854390.1987250188.1578618753.1578618753.1584538698.2; __utmb=51854390.0.10.1584538698; __utmc=51854390; _gat_gtag_UA_149949619_1=1; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1583761483,1584325469,1584538441,1584540435; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1584540435; KLBRSID=ed2ad9934af8a1f80db52dcb08d13344|1584540595|1584538597',
    'Host': 'www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36'
}
r = requests.get('https://www.zhihu.com', headers=zhihu_headers)
print(r.text)
# Build a cookie jar from a raw Cookie header string and pass it explicitly.
# Fixes two bugs in the original:
#   * requests.cookies.RequestCookieJar does not exist (missing 's') —
#     the class is RequestsCookieJar, so the original raised AttributeError;
#   * requests.get(..., cookie=jar) is an invalid keyword (TypeError) —
#     the parameter is ``cookies``.
cookies = '_zap=698024e1-91ad-4ef1-b497-8faa3456af84; _xsrf=Nk1tdLqXQKJcprow5wl84Uf9hF8eRJQs; d_c0="AOAvjwIetA-PTgv6gt-7-7DXekoQvE6Llbs=|1562565500"; __utmz=51854390.1578618753.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.110--|2=registration_date=20160502=1^3=entry_date=20160502=1; _ga=GA1.2.1987250188.1578618753; capsion_ticket="2|1:0|10:1583373076|14:capsion_ticket|44:NDBmNGYxNjJiYzFiNGNjNDljZmU5ZDNlN2M2M2E0OGI=|ef52da7f694e2d959b574ccf8770642b11a2c3cc15a995c4900eb6f90d85060a"; z_c0="2|1:0|10:1583373097|4:z_c0|92:Mi4xYVVqM0FnQUFBQUFBNEMtUEFoNjBEeWNBQUFDRUFsVk5LZWlIWGdEQl9VaWItMWstSm1VRG9ZejV0Q3F4cjBJVDln|ce6d3024eb77f9fb8e03cf68d8f2094ce84f905569bdabcc365892fac8ef0319"; tst=r; _gid=GA1.2.143501728.1584538440; q_c1=886762f2bae04daca68b9ae2af9a4e38|1584538854000|1564363133000; __utma=51854390.1987250188.1578618753.1578618753.1584538698.2; __utmc=51854390; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1584325469,1584538441,1584540435,1584540857; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1584540857; KLBRSID=ed2ad9934af8a1f80db52dcb08d13344|1584541061|1584538597'
jar = requests.cookies.RequestsCookieJar()
headers = {'Host': 'www.zhihu.com',
           'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36'
           }
for cookie in cookies.split(';'):
    # maxsplit=1: cookie values may themselves contain '=' characters
    key, value = cookie.split('=', 1)
    # strip the space left over from the '; ' separator so names match exactly
    jar.set(key.strip(), value)
r = requests.get('https://www.zhihu.com', cookies=jar, headers=headers)
print(r.text)
4.2 会话维持(Session)
- 没有设置会话维持
# Without a Session, each requests.get() is an independent connection: the
# cookie set by the first call is NOT sent with the second, so it comes back
# empty.
requests.get('http://httpbin.org/cookies/set/number/123456789')
resp = requests.get('http://httpbin.org/cookies')
print(resp.text)
- 设置会话
# A Session persists cookies across calls, so the cookie set by the first
# request is echoed back by the second.
session = requests.Session()
session.get('http://httpbin.org/cookies/set/number/123456789')
resp = session.get('http://httpbin.org/cookies')
print(resp.text)
推荐阅读
-
Python爬虫使用selenium爬取qq群的成员信息(全自动实现自动登陆)
-
【Python必学】Python爬虫反爬策略你肯定不会吧?
-
python爬虫系列:三、URLError异常处理
-
用于业余项目的8个优秀Python库
-
python爬虫之自动登录与验证码识别
-
Ubuntu18.04一次性升级Python所有库的方法步骤
-
Python17之函数、类、模块、包、库
-
Python_WIN10系统中递归所有文件夹所有文件_移动所有文件到主目录(使用到的库:os + glob + shutil)
-
Python实现mysql数据库更新表数据接口的功能
-
Python基于Hypothesis测试库生成测试数据