python学习笔记(二十二)爬虫基础(2):模拟浏览器,ajax动态爬取,爬取数据写入文件、图片爬虫
程序员文章站
2022-05-05 15:41:50
...
目录
一、将爬取到的数据写入文件
import urllib.request
# Save the fetched page directly into a local HTML file.
urllib.request.urlretrieve(
    "http://www.baidu.com",
    filename=r"F:\untitled\爬虫\爬虫\file\file2.html",
)
# urlretrieve() may leave temporary files behind while it runs;
# urlcleanup() removes whatever previous calls left over.
urllib.request.urlcleanup()
二、模拟浏览器
import urllib.request
import random
url = "http://www.baidu.com"
'''
#模拟请求头
headers = {
"Accept":"application/json,text/javascript,q=0.01",
"X-Requested-With":"XMLHttpRequest",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36",
"Content-Type":"application/x-www-form-urlencoded;charset=utf-8"
}
#设置一个请求体
req = urllib.request.Request(url,headers=headers)
#发起请求
response = urllib.request.urlopen(req)
data = response.read().decode("utf-8")
print(data)
'''
# Pool of browser User-Agent strings; one is picked at random per run so the
# request does not always announce the same client.
agentList = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    # The closing parenthesis was missing here, producing a malformed header.
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
]
agentStr = random.choice(agentList)
req = urllib.request.Request(url)
# Attach the randomly chosen User-Agent to the request object.
req.add_header("User-Agent", agentStr)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
三、动态ajax爬取
import urllib.request
import ssl
import json
def ajaxCrawler(url):
    """Fetch *url* with a browser User-Agent and return the parsed JSON body.

    Args:
        url: Address of an endpoint whose response body is UTF-8 encoded JSON.

    Returns:
        The Python object produced by ``json.loads`` on the response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    req = urllib.request.Request(url, headers=headers)
    # NOTE(security): this context skips TLS certificate verification, so the
    # HTTPS connection is open to man-in-the-middle attacks. Kept to preserve
    # the existing behaviour; prefer ssl.create_default_context() when the
    # target site's certificate is valid.
    context = ssl._create_unverified_context()
    # The context manager guarantees the response is closed, even when
    # reading or decoding raises.
    with urllib.request.urlopen(req, context=context) as response:
        jsonStr = response.read().decode("utf-8")
    return json.loads(jsonStr)
# url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=20&limit=20"
# info = ajaxCrawler(url)
# print(info)
# Walk result pages 1 through 10; each page holds 20 entries, so the offset
# passed as ``start`` is the page number times 20.
# The original iterated over the tuple (1, 11), which only requested pages
# 1 and 11 instead of the intended range.
for i in range(1, 11):
    url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=" + str(i * 20) + "&limit=20"
    info = ajaxCrawler(url)
    # Print how many entries the parsed response contains for this page.
    print(len(info))
四、图片爬虫
import urllib.request
import re
import os
def imgCrawler(url, toPath):
    """Download every image referenced on the page at *url* into *toPath*.

    The page is fetched with a browser User-Agent and decoded as UTF-8. Each
    ``<img src="//..."/>`` tag with a protocol-relative address is downloaded
    over HTTP and stored as ``1.jpg``, ``2.jpg``, ... in match order.

    Args:
        url: Address of the HTML page to scan.
        toPath: Directory that receives the image files; it is created when
            it does not exist yet.

    Raises:
        urllib.error.URLError: If the page or one of the images cannot be
            fetched.
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
    }
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the response as soon as the body is read.
    with urllib.request.urlopen(req) as response:
        HtmlStr = response.read().decode("utf-8")

    # urlretrieve() cannot write into a missing directory, so make sure the
    # target exists before downloading anything.
    os.makedirs(toPath, exist_ok=True)

    # Captures the address after the leading "//" of protocol-relative image
    # sources; re.S lets "." span line breaks inside a tag.
    pat = r'<img src="//(.*?)"/>'
    re_image = re.compile(pat, re.S)
    for num, imageUrl in enumerate(re_image.findall(HtmlStr), start=1):
        path = os.path.join(toPath, str(num) + ".jpg")
        # Store the image on the local disk.
        urllib.request.urlretrieve("http://" + imageUrl, filename=path)
# Page address handed to imgCrawler as the HTML page to scan for images.
url = 'https://search.yhd.com/c0-0/k%25E6%25AD%25BB%25E5%25BA%2593%25E6%25B0%25B4/'
# Local directory handed to imgCrawler as the download destination.
toPath =r'F:\untitled\爬虫\img'
# Run the image crawler on the page above.
imgCrawler(url,toPath)
上一篇: Jenkins Pipeline
下一篇: pipeline注释