Python爬虫入门笔记
程序员文章站
2024-02-19 18:41:10
...
Python爬虫入门笔记
读取文件匹配格式并写入文件
import re
# Extract the text of every <div class="name">...</div> element from the
# source file and write the captured strings to a new file, one per line.
# No encoding is passed to open(), so the platform default applies to both files.

# "with" closes the input file as soon as it has been read.
with open("f:/source.txt", "r") as source_file:
    data = source_file.read()

# Non-greedy capture; re.DOTALL is not set, so "." does not match a newline
# and a match cannot span multiple lines.
pat = r'<div class="name">(.*?)</div>'
res = re.compile(pat).findall(data)

# "with" also guarantees the output file is flushed and closed if a write fails.
with open("f:/newtxt.txt", "w") as fh:
    for i in res:
        fh.write(i + "\n")
构造测试URL
import urllib.request
# Build a Baidu search URL for a keyword, download the result page and save
# the raw response bytes to a local file.
keywd = "the"
# Percent-encode the keyword so it is safe to embed in the query string.
keywd = urllib.request.quote(keywd)
url = "http://www.baidu.com/s?ie=UTF-8&wd=" + keywd
req = urllib.request.Request(url)

# The response is a context manager; "with" closes the connection even if
# read() raises.
with urllib.request.urlopen(req) as response:
    data = response.read()

# Binary mode: the bytes are written exactly as received, with no decoding.
with open("f:/1.html", "wb") as fh1:
    fh1.write(data)
构造URL post
import urllib.request
import urllib.parse
# Submit a form via HTTP POST and save the raw response bytes to a local file.
url = "https://www.iqianyue.com/mypost"
# urlencode() builds the "name=...&pass=..." form body; Request needs it as bytes.
# NOTE(review): the "name" value looks like an e-mail address that was masked
# when this page was published - replace it with a real value before running.
mydata = urllib.parse.urlencode({"name": "[email protected]", "pass": "123456"}).encode("utf-8")
# Passing a data argument makes urllib send the request as POST instead of GET.
req = urllib.request.Request(url, mydata)
# Extra request headers can be attached here with req.add_header(key, value).

# "with" closes the HTTP response even if read() raises.
with urllib.request.urlopen(req) as response:
    data = response.read()

# Binary mode: the bytes are written exactly as received, with no decoding.
with open("f:/python study/3.html", "wb") as fh:
    fh.write(data)
测试urlretrieve捕获网页至本地
import urllib.request
# Download the page straight to a local file. The call returns a
# (filename, headers) tuple.
urllib.request.urlretrieve(
    url="https://www.hellobi.com/",
    filename="f:/python study/4.html",
)
output
('f:/python study/4.html', <http.client.HTTPMessage at 0x1fb9efc2c88>)
测试urlopen的timeout
import urllib.request
#data=urllib.request.urlopen("https://www.hellobi.com/",timeout=0.244)
# Request the same page 100 times with a very short timeout to observe how
# often urlopen() fails, printing the HTTP status code on success and the
# error on failure.
for attempt in range(100):
    try:
        # "with" closes each response so connections are not leaked across
        # iterations.
        with urllib.request.urlopen("https://www.hellobi.com/", timeout=0.244) as response:
            print(response.getcode())
    except Exception as e:
        # Deliberately broad: any failure of a single attempt is reported and
        # the loop moves on to the next attempt.
        # When urlopen() raises there is no response object for this attempt,
        # so only the exception itself is reported.
        print("出现异常" + str(e))