Python Web Scraping: Basic Usage of the urllib Library
urllib lets a program issue all kinds of HTTP requests. If you want to simulate a browser in order to accomplish a particular task, you have to disguise the request as one coming from a browser. The way to do that is to first observe the requests the browser actually sends, then imitate the browser's request headers; the User-Agent header is the one that identifies the browser.
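A minimal sketch of that idea, assuming httpbin.org as a test endpoint and using a shortened placeholder User-Agent string (copy the full string your own browser sends, as the later examples do):

# Disguise a urllib request as a browser by supplying a User-Agent header.
# The UA value below is only a placeholder; use the one from your browser's
# developer tools.
import urllib.request

req = urllib.request.Request(
    'http://httpbin.org/get',
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
)
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))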
# GET request
import urllib.request

response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode('utf-8'))

# POST request
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())

# Timeout
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('time out')

# Response type
import urllib.request

response = urllib.request.urlopen('http://www.python.org')
print(type(response))

# Status code and response headers
import urllib.request

response = urllib.request.urlopen('http://www.python.org')
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))

# Request object
import urllib.request

request = urllib.request.Request('http://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    'Host': 'httpbin.org'
}
dict = {
    'name': 'germey'
}
data = bytes(parse.urlencode(dict), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

from urllib import request, parse

url = 'http://httpbin.org/post'
dict = {
    'name': 'germey'
}
data = bytes(parse.urlencode(dict), encoding='utf-8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

# Proxy
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())

# Cookies
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + " = " + item.value)

# Save cookies to 1.txt
import http.cookiejar, urllib.request

filename = '1.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

# Another way to save cookies
import http.cookiejar, urllib.request

filename = '1.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

# Load cookies
import http.cookiejar, urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('1.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
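The openers above are used by calling opener.open() directly. urllib.request also provides install_opener(), which makes an opener the default for every later urlopen() call; a minimal sketch reusing the cookie handler from above:

# Install an opener globally so that plain urlopen() calls reuse the same
# cookie handler built above.
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
urllib.request.install_opener(opener)

response = urllib.request.urlopen('http://www.baidu.com')  # now goes through the opener
print(len(cookie), 'cookies captured')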
# Exception handling
from urllib import request, error

try:
    response = request.urlopen('http://lidonghao.com')
except error.URLError as e:
    print(e.reason)

from urllib import request, error

try:
    response = request.urlopen('http://www.baidu.com/101')
except error.HTTPError as e:
    print(e.reason, e.code, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request successful')

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen("https://www.baidu.com", timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print("time out")
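These error classes can be folded into a small reusable helper. The fetch() function below is just one possible wrapper (not part of urllib itself), combining the timeout, HTTPError, and URLError patterns shown above:

# Hypothetical helper: set a timeout, report HTTP errors separately from
# network-level failures, and return None on failure.
import socket
from urllib import request, error

def fetch(url, timeout=5):
    try:
        with request.urlopen(url, timeout=timeout) as resp:
            return resp.read().decode('utf-8')
    except error.HTTPError as e:      # server responded with an error status
        print('HTTP error:', e.code, e.reason)
    except error.URLError as e:       # connection failed or timed out
        if isinstance(e.reason, socket.timeout):
            print('time out')
        else:
            print('URL error:', e.reason)
    return None

print(fetch('http://httpbin.org/get') is not None)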
# Parsing URLs
# urlparse
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

from urllib.parse import urlparse

result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)

from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)

from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
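For reference, the ParseResult that urlparse() returns is a named tuple, so the six components can also be read by attribute rather than by index:

# The six components of a parsed URL are available as attributes.
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result.netloc, result.path, result.params, result.query, result.fragment)
# -> http www.baidu.com /index.html user id=5 comment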
# urlunparse
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))

# urljoin
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'faq.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/faq.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/faq.html'))
print(urljoin('http://www.baidu.com/about.html', 'http://cuiqingcai.com/faq.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))

# urlencode
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
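urlencode() also takes care of percent-encoding values that are not URL-safe; a short sketch with an arbitrary non-ASCII query value:

# Non-ASCII values are percent-encoded as UTF-8; order follows the dict.
from urllib.parse import urlencode

params = {'wd': '爬虫', 'page': 1}
print('http://www.baidu.com/s?' + urlencode(params))
# -> http://www.baidu.com/s?wd=%E7%88%AC%E8%99%AB&page=1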