爬虫基础——urllib库
程序员文章站
2022-05-03 19:59:32
...
# Working with urllib (stdlib HTTP client)
from urllib import request   # fetching URLs
from urllib import parse     # URL quoting / query-string encoding
# Percent-encode a non-ASCII string so it can be embedded in a URL
print(parse.quote('范冰冰'))
# output: %E8%8C%83%E5%86%B0%E5%86%B0
# urlopen: the simplest way to issue a GET request
url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E8%8C%83%E5%86%B0%E5%86%B0'
rsp = request.urlopen(url)   # returns an http.client.HTTPResponse
print(type(rsp))
html = rsp.read()            # read() drains the whole body as bytes
print(type(html))
# print(html)
# output: <class 'http.client.HTTPResponse'>  <class 'bytes'>
# Decode the raw bytes into a str (UTF-8 is the default codec)
doc = str(html, 'utf-8')
print(type(doc))
# print(doc)
# output: <class 'str'>
# chardet guesses the character encoding of a byte string
# (third-party package: conda install chardet)
import chardet
ct = chardet.detect(html)
print(ct)
# e.g. {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
# -> guessed utf-8 with 99% confidence; language not detected
# output: {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
# Decode using the detected encoding, falling back to utf-8
# when detection returned nothing (robust default)
encoding = ct.get('encoding', 'utf-8')
doc = html.decode(encoding)
# print(doc)
# A quick look at the response object itself
print(type(rsp))
# output: <class 'http.client.HTTPResponse'>
# HTTP status codes returned by the server:
#   200 OK
#   404 Not Found
#   403 Forbidden
#   ...
# getcode() exposes the numeric status of a response
print(request.urlopen('http://www.laochu.net/').getcode())
# HTTP offers two common access methods: GET and POST.
# GET passes parameters to the server inside the URL itself.
url = 'http://www.baidu.com/s?'          # Baidu search endpoint
kw = input('请输入关键词:')               # search keyword from the user
cpr = parse.urlencode({"wd": kw})        # percent-encode the query dict
print(cpr)
full_url = url + cpr                     # assemble the complete URL
print(full_url)
rsp = request.urlopen(full_url)          # issue the GET request
html = rsp.read()                        # raw response bytes
doc = html.decode()                      # decode into a string
# print(doc)
# POST: pass the payload through urlopen's `data` parameter.
# urllib then adjusts the request headers for us:
#   Content-Type: application/x-www-form-urlencoded
#   Content-Length: <payload size>
# Encode the form with urllib.parse.urlencode.
# Target: Baidu translate suggestion API
base_url = 'http://fanyi.baidu.com/sug'
# Collect the POST parameters
kw = input('请输入英文:')
data = {
    'kw': kw
}
# URL-encode, then turn the string into bytes (urlopen requires bytes)
req_data = parse.urlencode(data).encode()
# The presence of `data` makes this request a POST
rsp = request.urlopen(base_url, data=req_data)
# Fetch and decode the reply text
html = rsp.read().decode()
print(html)
# Deserialize the JSON reply
import json
json_data = json.loads(html)
print(json_data)
# Show each suggestion pair
results = json_data['data']
for s in results:
    print(s['k'], '', s['v'])
# json and pickle: two stdlib serialization / deserialization formats
import json, pickle
d = (123, 'Abc')
a = {'label1': 'Hello,Python', 'label2': d}
print(a)
# dump()/dumps(): json turns the tuple into a JSON array
print(json.dumps(a))
s = '{"label1": "Hello,Python", "label2": [123, "Abc"]}'
e = json.loads(s)
print(type(e))
print(e.keys())
# walk the decoded dict and show every value
for label in e:
    print(e[label])
# pickle produces bytes and round-trips Python objects exactly
# (the tuple survives, unlike with json)
print(pickle.dumps(a))
sr =b'\x80\x03}q\x00(X\x06\x00\x00\x00label1q\x01X\x0c\x00\x00\x00Hello,Pythonq\x02X\x06\x00\x00\x00label2q\x03K{X\x03\x00\x00\x00Abcq\x04\x86q\x05u.'
print(type(sr))
f = pickle.loads(sr)
print(type(f))
print(f)
# urlopen alone cannot express complex requests; for extra header
# information we build a request.Request object instead.
# Base endpoint
base_url = 'http://fanyi.baidu.com/sug'
# POST parameters
data = {
    'kw': 'python'
}
# URL-encode, then convert the string to bytes
req_data = parse.urlencode(data).encode()
# Explicit headers for the form-encoded POST body
header = {
    'Content-Length': len(req_data),
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
}
# Build the request object, send it, and collect the response
req = request.Request(url=base_url, data=req_data, headers=header)
rsp = request.urlopen(req)
# Fetch and decode the reply text
html = rsp.read().decode()
print(html)
# Print every suggestion pair from the JSON reply
for entry in json.loads(html)['data']:
    print(entry['k'], entry['v'])
import random as rnd
# Build a 52-card deck as (suit, rank) pairs, then shuffle in place
h = [(suit, rank) for rank in range(13) for suit in range(4)]
print(h)
rnd.shuffle(h)
print(h)
# urllib.error: HTTPError carries a server status code;
# URLError covers lower-level network failures (HTTPError is its subclass,
# so it must be caught first).
from urllib.error import *
url='http://www.wwwlaochu.net/m.html'
try:
    rsp = request.urlopen(request.Request(url))
    html = rsp.read().decode()
    print(html)
except HTTPError as e:
    print('HTTPERROR:')
    print(e)
except URLError as e:
    print('URLERROR:')
    print(e)
# UA (User-Agent): a string describing the client's browser and device,
# sent so the server can tailor its response.
url = 'http://www.baidu.com'
# Pretend to be a Mozilla browser on an Android phone
headers = {
    'User-Agent':'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0'
}
# BUG FIX: the original passed `headers=header`, reusing the stale
# Content-Type dict from the earlier POST example, so the custom
# User-Agent defined just above was never actually sent.
req = request.Request(url, headers=headers)
rsp = request.urlopen(req)
html = rsp.read().decode()
#print(html)
# AJAX endpoints return JSON directly — e.g. the movie listing API
# behind movie.douban.com, paged via page_limit / page_start.
page = 1
limit = 20
start = page * limit
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit={0}&page_start={1}'.format(limit, start)
print(url)
rsp = request.urlopen(url)
htm = rsp.read().decode()
# print(htm)
json_data = json.loads(htm)
# title, poster width, and rating for every listed movie
for movie in json_data['subjects']:
    print(movie['title'], movie['cover_x'], movie['rate'])