
Python Web Scraping Learning Path (Part 1): Fetching Web Pages with urllib


1. Quick start, the most basic usage: urllib.request.urlopen(url)

import urllib.request

url = "https://m.weibo.cn/"
file = urllib.request.urlopen(url)
dataline = file.readline()  # read only the first line of the response
data = file.read()          # read the remainder of the body as bytes

print(data)
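Besides the body, the object returned by urlopen() exposes metadata about the completed request; a minimal sketch:

import urllib.request

file = urllib.request.urlopen("https://m.weibo.cn/")
print(file.getcode())  # HTTP status code, e.g. 200
print(file.geturl())   # final URL after any redirects
print(file.info())     # response headers as an http.client.HTTPMessage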
2. Saving the page to a file
file = urllib.request.urlopen(url)
data = file.read()  # the whole page as bytes

fhandle = open("C:/Python3/web/review/weibo.html", "wb")  # binary mode, since data is bytes
fhandle.write(data)
fhandle.close()

# urlretrieve() downloads straight to a local file and returns (filename, headers)
filename = urllib.request.urlretrieve(url, filename="C:/Python3/web/review/weibo2.html")
urllib.request.urlcleanup()  # clean up the cache that urlretrieve may leave behind
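read() returns raw bytes; to work with the page as text, the bytes can be decoded using the charset declared in the response headers, falling back to UTF-8 when none is declared. A minimal sketch:

import urllib.request

file = urllib.request.urlopen("https://m.weibo.cn/")
charset = file.info().get_content_charset() or "utf-8"  # charset from the Content-Type header
text = file.read().decode(charset)
print(text[:200])  # first 200 characters of the decoded page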


2. Setting request headers


1.build_opener()

headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/65.0.3325.181 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]  # a list of (header, value) tuples sent with every request
data = opener.open(url).read()
fhandle = open("C:/Python3/web/review/weibo3.html", "wb")
fhandle.write(data)
fhandle.close()
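To make every later urlopen() call send these headers as well, the opener can be installed as the process-wide default with install_opener(); a minimal sketch reusing the opener built above:

urllib.request.install_opener(opener)      # opener becomes the global default
data = urllib.request.urlopen(url).read()  # this request now carries the custom User-Agent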

2.add_header()
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                             "AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/65.0.3325.181 Safari/537.36")
data = urllib.request.urlopen(req).read()
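Equivalently, the headers can be supplied as a dict when constructing the Request, avoiding separate add_header() calls; a minimal sketch:

import urllib.request

url = "https://m.weibo.cn/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/65.0.3325.181 Safari/537.36"}
req = urllib.request.Request(url, headers=headers)  # headers attached at construction time
data = urllib.request.urlopen(req).read()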

3. Timeout

url = "http://www.baidu.com" 
for i in range(1,100):
    try:
        file = urllib.request.urlopen(url,timeout = 1)
        data = file.read()
        print (len(data))
    except Exception as e:
        print("出现异常-->"+str(e))

4. HTTP requests in practice

1.GET

import urllib.request

filepath = "C:/Python3/web/second/get你好.html"
keywd = "你好"  # "hello"
key_code = urllib.request.quote(keywd)  # percent-encode the non-ASCII keyword
url = "http://www.baidu.com/s?wd=" + key_code
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
fhandle = open(filepath, "wb")
fhandle.write(data)
fhandle.close()
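When a request has several query parameters, urllib.parse.urlencode() builds the whole query string at once instead of quoting each value by hand; a minimal sketch:

import urllib.parse
import urllib.request

params = urllib.parse.urlencode({"wd": "你好"})  # percent-encodes keys and values (UTF-8 by default)
url = "http://www.baidu.com/s?" + params
data = urllib.request.urlopen(url).read()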
2.POST
import urllib.request
import urllib.parse

filename = "C:/Python3/web/second/post.html"
url = "http://iqianyue.com/mypost/"
postdata = urllib.parse.urlencode({
    "name": "mmm",
    "pass": "1111"
}).encode("utf-8")  # URL-encode the form fields, then encode to UTF-8 bytes
req = urllib.request.Request(url, postdata)  # supplying data makes this a POST request
data = urllib.request.urlopen(req).read()
fhandle = open(filename, "wb")
fhandle.write(data)
fhandle.close()
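Supplying the data argument is what turns the request into a POST; since Python 3.3 the method can also be stated explicitly on the Request, which makes the intent unmistakable. A minimal sketch:

import urllib.parse
import urllib.request

url = "http://iqianyue.com/mypost/"
postdata = urllib.parse.urlencode({"name": "mmm", "pass": "1111"}).encode("utf-8")
req = urllib.request.Request(url, data=postdata, method="POST")  # method spelled out explicitly
data = urllib.request.urlopen(req).read()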

5. Using a proxy server


import urllib.request

def use_proxy(proxy_addr, url):
    # route HTTP traffic through the given proxy address
    proxy = urllib.request.ProxyHandler({"http": proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)  # make this opener the global default
    data = urllib.request.urlopen(url).read().decode("utf-8")
    return data

proxy_addr = "114.99.30.195:18118"
url = "http://www.baidu.com"
data = use_proxy(proxy_addr, url)
print(len(data))
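install_opener() changes the process-wide default opener; if the proxy should only apply to some requests, the opener can be called directly instead. A minimal sketch:

import urllib.request

proxy = urllib.request.ProxyHandler({"http": "114.99.30.195:18118"})
opener = urllib.request.build_opener(proxy)
data = opener.open("http://www.baidu.com").read().decode("utf-8")  # proxy applies only to this opener
print(len(data))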

6.DebugLog


import urllib.request

url = "http://www.baidu.com"
httphd = urllib.request.HTTPHandler(debuglevel=1)    # dump HTTP request/response traffic
httpshd = urllib.request.HTTPSHandler(debuglevel=1)  # likewise for HTTPS
opener = urllib.request.build_opener(httphd, httpshd)
urllib.request.install_opener(opener)
data = urllib.request.urlopen(url)  # the debug log is printed while this request runs
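With debuglevel=1, the underlying http.client connection prints the raw traffic (send:, reply:, header: lines) to stdout as the request runs. To keep the logging scoped to particular requests rather than installing the opener globally, the opener can be called directly; a minimal sketch:

import urllib.request

opener = urllib.request.build_opener(
    urllib.request.HTTPHandler(debuglevel=1),
    urllib.request.HTTPSHandler(debuglevel=1))
data = opener.open("http://www.baidu.com").read()  # debug lines printed for this call only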


7.URLError

import urllib.request
import urllib.error

url = "http://www.baiduuuuuu.com"
try:
    urllib.request.urlopen(url)
except urllib.error.URLError as e:
    if hasattr(e, "code"):    # only HTTPError instances carry a status code
        print(e.code)
    if hasattr(e, "reason"):  # every URLError carries a reason
        print(e.reason)
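Because HTTPError is a subclass of URLError, another common pattern is to catch it first and handle HTTP status errors separately from connection-level failures such as the DNS error above; a minimal sketch:

import urllib.request
import urllib.error

try:
    urllib.request.urlopen("http://www.baiduuuuuu.com")
except urllib.error.HTTPError as e:  # the server responded, but with an error status
    print(e.code, e.reason)
except urllib.error.URLError as e:   # no usable response at all, e.g. a DNS failure
    print(e.reason)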







