Python 爬虫:urllib.request
程序员文章站
2022-05-03 21:29:13
...
urllib.request.urlopen(url) :网络请求
urllib.parse.quote(url, safe=string.printable):对字符串中的非 ASCII 字符(如汉字)进行百分号编码(percent-encoding),得到只含 ASCII 字符的合法 URL;safe 参数中列出的字符保持原样不被编码
urllib.parse.urlencode(dict) :将字典转化成URL中的查询参数形式(key1=value1&key2=value2),并自动对键和值进行百分号编码
import urllib.request
def load(url: str = 'http://www.baidu.com',
         output_path: str = 'baidu.html',
         timeout: float = 10.0) -> None:
    """Fetch *url* with a GET request and save the decoded body to a file.

    Args:
        url: Address to request. Defaults to the Baidu home page.
        output_path: File the decoded page is written to (UTF-8 text).
        timeout: Seconds to wait on the network before giving up.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body cannot be decoded with the charset
            in use.
    """
    # urlopen() returns a context manager; leaving the ``with`` block closes
    # the underlying connection even if reading fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        print(response)
        # read() returns the raw body as bytes.
        data = response.read()
        # Prefer the charset declared in the Content-Type header and fall
        # back to UTF-8 when the server does not declare one.
        charset = response.headers.get_content_charset() or 'utf-8'
    print(data)

    # bytes -> str, so the page can be written in text mode.
    str_data = data.decode(charset)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(str_data)

    # The reverse conversion (str -> bytes), needed when the fetched data is
    # a str but the destination expects bytes.
    str_name = 'sfj'
    byte_name = str_name.encode('utf-8')
    print(byte_name)
load()
import urllib.request
import urllib.parse
import string
def get_method_params(name: str = '帅哥',
                      url: str = 'http://www.baidu.com/s?wd=',
                      output_path: str = '02-baidu.html',
                      timeout: float = 10.0) -> None:
    """Request ``url + name`` and save the decoded response body to a file.

    Args:
        name: Raw (unencoded) value appended to *url*; may contain
            non-ASCII characters, spaces or URL-reserved characters.
        url: Already-valid URL prefix that the encoded *name* is appended to.
        output_path: File the decoded page is written to (UTF-8 text).
        timeout: Seconds to wait on the network before giving up.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body cannot be decoded with the charset
            in use.
    """
    # A URL may only contain ASCII characters, so the value is
    # percent-encoded. Only the value is quoted (with no "safe" characters):
    # quoting the whole URL with safe=string.printable would leave spaces,
    # '&', '#' and '=' inside the value untouched and corrupt the URL.
    final_url = url + urllib.parse.quote(name, safe='')
    print(final_url)

    # The ``with`` block closes the connection once the body has been read.
    with urllib.request.urlopen(final_url, timeout=timeout) as response:
        # Prefer the charset declared by the server; fall back to UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'
        data = response.read().decode(charset)
    print(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(data)
get_method_params()
import urllib.request
import urllib.parse
import string
def get_params(param=None,
               url: str = 'http://www.baidu.com/s?',
               output_path: str = '03-get_dict_params.html',
               timeout: float = 10.0) -> None:
    """Send a GET request whose query string is built from a dict.

    Args:
        param: Mapping of query parameter names to values. When omitted,
            the sample parameters ``wd=中文&key=zhang&value=san`` are used.
        url: Already-valid URL prefix ending in ``?`` that the encoded
            query string is appended to.
        output_path: File the decoded page is written to (UTF-8 text).
        timeout: Seconds to wait on the network before giving up.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body cannot be decoded with the charset
            in use.
    """
    # Build the default here rather than in the signature so that no mutable
    # dict is shared between calls.
    if param is None:
        param = {
            'wd': '中文',
            'key': 'zhang',
            'value': 'san',
        }

    # urlencode() turns the dict into "k1=v1&k2=v2" and percent-encodes every
    # key and value, so the result is already pure ASCII; no further
    # urllib.parse.quote() pass over the URL is needed.
    str_params = urllib.parse.urlencode(param)
    print(str_params)
    final_url = url + str_params

    # The ``with`` block closes the connection once the body has been read.
    with urllib.request.urlopen(final_url, timeout=timeout) as response:
        # Prefer the charset declared by the server; fall back to UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'
        data = response.read().decode(charset)
    print(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(data)
get_params()