欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

python学习笔记(二十二)爬虫基础(2):模拟浏览器,ajax动态爬取,爬取数据写入文件、图片爬虫

程序员文章站 2022-05-05 15:41:50
...

目录

一、将爬取到的数据写入文件

二、模拟浏览器

三、动态ajax爬取

四、图片爬虫


 

一、将爬取到的数据写入文件

import urllib.request

# Download the page and write it directly to a local HTML file.
urllib.request.urlretrieve(
    "http://www.baidu.com",
    r"F:\untitled\爬虫\爬虫\file\file2.html",
)

# urlretrieve() can leave cached temporary files behind while it runs;
# urlcleanup() clears them.
urllib.request.urlcleanup()

二、模拟浏览器

import urllib.request
import random

url = "http://www.baidu.com"

# Alternative approach (disabled): send a fixed, complete set of request
# headers instead of only a User-Agent.
#
# headers = {
#     "Accept": "application/json,text/javascript,q=0.01",
#     "X-Requested-With": "XMLHttpRequest",
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36",
#     "Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
# }
# req = urllib.request.Request(url, headers=headers)
# with urllib.request.urlopen(req) as response:
#     data = response.read().decode("utf-8")
# print(data)

# Pool of browser User-Agent strings; one is picked at random per run so the
# request does not always present the same client identity.
agentList = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    # The closing parenthesis was missing from this entry, which made the
    # header value malformed.
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
]
agentStr = random.choice(agentList)

req = urllib.request.Request(url)
# Attach the chosen User-Agent to the request.
req.add_header("User-Agent", agentStr)

# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))

三、动态ajax爬取

import urllib.request
import ssl
import json

def ajaxCrawler(url):
    """Fetch *url* and return its response body parsed as JSON.

    The request is sent with a desktop-browser User-Agent header and with
    TLS certificate verification disabled.

    Args:
        url: Address of the endpoint to request.

    Returns:
        The Python object produced by ``json.loads`` from the UTF-8 decoded
        response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    req = urllib.request.Request(url, headers=headers)

    # NOTE(security): an unverified SSL context skips certificate checks, so
    # the server's identity is not authenticated. Kept to preserve the
    # existing behavior; review before using against anything sensitive.
    context = ssl._create_unverified_context()

    # Close the response as soon as the body is read instead of leaking it.
    with urllib.request.urlopen(req, context=context) as response:
        jsonStr = response.read().decode("utf-8")

    return json.loads(jsonStr)


# Single-page example:
# url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=20&limit=20"
# info = ajaxCrawler(url)
# print(info)

# Request pages 1 through 10, 20 entries per page (start = 20, 40, ..., 200).
# range() is required here: looping over the bare tuple (1, 11) would fetch
# only the two offsets 20 and 220.
for i in range(1, 11):
    url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=" + str(i * 20) + "&limit=20"
    info = ajaxCrawler(url)
    print(len(info))

四、图片爬虫

import urllib.request
import re
import os

def imgCrawler(url, toPath):
    """Download the images referenced by the page at *url* into *toPath*.

    The page is fetched with a browser User-Agent header and decoded as
    UTF-8. Every ``<img src="//..."/>`` tag found in it is downloaded over
    ``http://`` and stored as ``1.jpg``, ``2.jpg``, ... in match order.

    Args:
        url: Address of the HTML page to scan for images.
        toPath: Directory that receives the downloaded files. It is created
            (including parents) if it does not exist yet.

    Returns:
        A list of the local file paths that were written, in download order.

    Raises:
        urllib.error.URLError: If the page or one of the images cannot be
            fetched.
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
    }

    req = urllib.request.Request(url, headers=headers)
    # Close the response once the markup has been read instead of leaking it.
    with urllib.request.urlopen(req) as response:
        HtmlStr = response.read().decode("utf-8")

    # Captures the protocol-relative address (the part after the leading
    # "//") of each self-closing <img> tag; re.S lets "." span newlines.
    pat = r'<img src="//(.*?)"/>'
    imgList = re.findall(pat, HtmlStr, re.S)

    # urlretrieve() does not create missing directories, so do it up front.
    os.makedirs(toPath, exist_ok=True)

    savedPaths = []
    for num, imageUrl in enumerate(imgList, start=1):
        path = os.path.join(toPath, str(num) + ".jpg")
        # Store the image on local disk.
        urllib.request.urlretrieve("http://" + imageUrl, filename=path)
        savedPaths.append(path)
    return savedPaths

# Page to scan for images, and the local folder that receives them.
url = "https://search.yhd.com/c0-0/k%25E6%25AD%25BB%25E5%25BA%2593%25E6%25B0%25B4/"
toPath = r"F:\untitled\爬虫\img"
imgCrawler(url=url, toPath=toPath)

 

上一篇: Jenkins Pipeline

下一篇: pipeline注释