Python Basics - Web Crawler - Day 03
Contents
day02
1. Regex parsing
  - Grouping: wrap whatever you want to capture in parentheses ()
  - Regex methods
    p = re.compile('...')
    r_list = p.findall(html)
    Result: [(),(),(),()]
  - Greedy match: .*
  - Non-greedy match: .*? (see the short example below)
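A minimal sketch of the difference, using a made-up HTML snippet (the string and tags are only for illustration):

    import re

    html = '<p>one</p><p>two</p>'

    # Greedy: .* grabs as much as possible, so both paragraphs collapse into one match
    print(re.findall(r'<p>(.*)</p>', html))    # ['one</p><p>two']

    # Non-greedy: .*? stops at the first closing tag, giving one match per <p>
    print(re.findall(r'<p>(.*?)</p>', html))   # ['one', 'two']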
2. Scraping steps
  - Find the URL
  - Write the regular expression
  - Define a class and lay out the program framework (see the skeleton sketch below)
  - Fill in the code
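A minimal skeleton for that framework, assuming a placeholder start URL and a regex to be filled in later (class and method names here are only suggestions):

    import re
    import requests

    class Spider:
        def __init__(self):
            # Placeholder start URL and headers; adjust for the target site
            self.baseurl = "http://example.com/page"
            self.headers = {"User-Agent": "Mozilla/5.0"}

        def get_page(self, url):
            res = requests.get(url, headers=self.headers)
            res.encoding = "utf-8"
            self.parse_page(res.text)

        def parse_page(self, html):
            p = re.compile(r'...', re.S)   # fill in the real pattern here
            r_list = p.findall(html)
            print(r_list)

        def work_on(self):
            self.get_page(self.baseurl)

    if __name__ == "__main__":
        Spider().work_on()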
3. Saving to a CSV file
  - import csv
    with open('xxx.csv', 'a', newline="", encoding="") as f:
        writer = csv.writer(f)
        writer.writerow([..., ..., ...])
4. Common Fiddler menus
  - Inspectors: split into a request pane and a response pane
  - Commonly used tabs
    - Headers
    - WebForms
    - Raw: the request as plain text
5. Cookies and sessions
  - cookie: kept on the client side
  - session: kept on the web server side
6. Request methods
  - GET
  - POST
  - Simulated login with cookies
    - Log in successfully once and capture the Cookie with a packet-capture tool
    - Turn the Request Headers (including the cookie) into a dict and pass it as a parameter when sending requests (see the sketch below)
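A minimal sketch of that idea; the URL and cookie string below are placeholders to be replaced with values captured from a real logged-in session:

    import requests

    # Placeholder values: copy the real Cookie and User-Agent from a captured request
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Cookie": "sessionid=xxxx; token=yyyy",
    }

    url = "http://example.com/user/profile"   # a page that normally requires login
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    print(res.text)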
7. Installing modules
  - Anaconda Prompt: conda install <module name>
  - Windows cmd: python -m pip install <module name>
8. The requests module
  - get(url, params=params, headers=headers)
    params: query parameters as a dict; no manual encoding or URL concatenation needed
  - post(url, data=data, headers=headers)
    data: form data as a dict; no manual encoding or conversion needed (a short usage sketch follows this item)
  - Response object attributes
    - encoding: response character encoding, e.g. res.encoding = 'utf-8'
    - text: response body as a string
    - content: response body as bytes
    - status_code: HTTP status code
    - url: the URL that actually returned the data
  - Saving unstructured (binary) data
    html = res.content
    with open("XXX", "wb") as f:
        f.write(html)
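A minimal sketch of a GET with query parameters and a POST with form data; httpbin.org is used here only as a convenient echo service:

    import requests

    headers = {"User-Agent": "Mozilla/5.0"}

    # GET: requests builds and appends the query string for you
    params = {"q": "python", "page": 1}
    res = requests.get("http://httpbin.org/get", params=params, headers=headers)
    print(res.status_code, res.url)

    # POST: form data is passed as a plain dict
    data = {"user": "tom", "pwd": "123456"}
    res = requests.post("http://httpbin.org/post", data=data, headers=headers)
    res.encoding = "utf-8"
    print(res.text)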
day03
1. The requests module
  - Proxies (parameter name: proxies)
    - Sites that list proxy IPs
      西刺代理 (Xici)
      快代理 (Kuaidaili)
      全网代理
    - Normal proxy
      - proxies = {'protocol': 'protocol://IP:port'}
        proxies = {'http': 'http://203.86.26.9:3128'}

      '''01_普通代理示例.py'''
      import requests

      url = "http://www.baidu.com/"
      proxies = {"http": "http://183.129.207.82:11597"}
      headers = {"User-Agent": "Mozilla/5.0"}

      res = requests.get(url, proxies=proxies, headers=headers)
      print(res.status_code)
    - Private (authenticated) proxy
      - proxies = {'protocol': 'protocol://username:password@IP:port'}

      '''02_私密代理示例.py'''
      import requests

      url = "http://httpbin.org/get"
      headers = {"User-Agent": "Mozilla/5.0"}
      proxies = {"http": "http://309435365:[email protected]:16817"}

      res = requests.get(url, proxies=proxies, headers=headers)
      res.encoding = "utf-8"
      print(res.text)
Review examples for pymysql and pymongo:

'''03_pymysql回顾 - create a MySQL database spiderdb, create table t1, insert records'''
import pymysql
import warnings

# Create the database connection object
db = pymysql.connect(host="localhost", user="root",
                     password="123456", charset="utf8")
# Create the cursor object
cursor = db.cursor()

# Suppress "already exists" warnings, then run the DDL statements
warnings.filterwarnings("ignore")
try:
    cursor.execute("create database if not exists spiderdb")
    cursor.execute("use spiderdb")
    cursor.execute("create table if not exists t1(id int)")
except Warning:
    pass

# Insert records
ins = "insert into t1 values(%s)"
cursor.execute(ins, [1])
cursor.execute(ins, [2])

# Commit and close
db.commit()
cursor.close()
db.close()

----------------------------------------------------------------------

'''04_pymongo回顾.py'''
import pymongo

# Create the connection object
conn = pymongo.MongoClient("localhost", 27017)
# Create the database object; spiderdb is the database name
db = conn.spiderdb
# Create the collection object from the database object
myset = db.t1
# Insert a document (insert_one in current pymongo)
myset.insert_one({"name": "Tom"})

Check the result in the mongo shell:
show dbs
use spiderdb
show tables
db.t1.find().pretty()
  - Case 1: scrape Lianjia second-hand housing listings --> store them in a MySQL database
    - Find the URL: https://bj.lianjia.com/ershoufang/pg/
    - Regex: '<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>', re.S
    - Write the code
Inserting into MongoDB:

'''05_链家数据ToMongo.py'''
import requests
import re
import pymongo

class LianjiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.page = 1
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Lianjia
        self.myset = self.db.housePrice

    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies, headers=self.headers, timeout=5)
        res.encoding = "utf-8"
        html = res.text
        print("Page fetched, parsing...")
        self.parsePage(html)

    def parsePage(self, html):
        p = re.compile('<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>', re.S)
        r_list = p.findall(html)
        # [("天通苑","480","万"), ...]
        print("Page parsed, saving to database...")
        self.writeTomongo(r_list)

    def writeTomongo(self, r_list):
        for r_tuple in r_list:
            D = {"houseName": r_tuple[0].strip(),
                 "totalPrice": float(r_tuple[1].strip()) * 10000}
            self.myset.insert_one(D)
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to scrape (q to quit): ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.page) + "/"
                self.getPage(url)
                self.page += 1
            else:
                print("Scraping finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = LianjiaSpider()
    spider.workOn()
Inserting into MySQL:

'''05_链家数据ToMongo.py'''
import requests
import re
import pymysql
import warnings

class LianjiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.page = 1
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="123456", charset="utf8")
        self.cursor = self.db.cursor()

    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies, headers=self.headers, timeout=5)
        res.encoding = "utf-8"
        html = res.text
        print("Page fetched, parsing...")
        self.parsePage(html)

    def parsePage(self, html):
        p = re.compile('<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>', re.S)
        r_list = p.findall(html)
        # [("天通苑","480","万"), ...]
        print("Page parsed, saving to database...")
        self.writeTomysql(r_list)

    def writeTomysql(self, r_list):
        c_db = "create database if not exists Lianjiadb character set utf8"
        u_db = "use Lianjiadb"
        c_tab = ("create table if not exists housePrice("
                 "id int primary key auto_increment,"
                 "housename varchar(50),"
                 "totalprice int)charset=utf8")
        # Suppress "already exists" warnings
        warnings.filterwarnings("ignore")
        try:
            self.cursor.execute(c_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_tab)
        except Warning:
            pass
        ins = "insert into housePrice(housename,totalprice) values(%s,%s)"
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            price = float(r_tuple[1].strip()) * 10000
            L = [name, price]
            self.cursor.execute(ins, L)
            self.db.commit()
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to scrape (q to quit): ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.page) + "/"
                self.getPage(url)
                self.page += 1
            else:
                self.cursor.close()
                self.db.close()
                print("Scraping finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = LianjiaSpider()
    spider.workOn()
While we are at it, store day02's Maoyan Top 100 movies in MySQL:

'''06_猫眼电影top100抓取.py'''
import requests
import re
import pymysql
import warnings

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "http://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.page = 1
        self.offset = 0
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        # Reuses the Lianjiadb database created in the previous example
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="123456", database="Lianjiadb",
                                  charset="utf8")
        self.cursor = self.db.cursor()

    # Download the page
    def loadPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # Parse the page
    def parsePage(self, html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        r_list = p.findall(html)
        # [("霸王别姬","张国荣","1994-01-01"), ...]
        self.writeTomysql(r_list)

    def writeTomysql(self, r_list):
        c_tab = ("create table if not exists top100("
                 "id int primary key auto_increment,"
                 "name varchar(50),"
                 "star varchar(100),"
                 "releasetime varchar(50)"
                 ")charset=utf8")
        ins = "insert into top100(name,star,releasetime) values(%s,%s,%s)"
        # Suppress "table already exists" warnings
        warnings.filterwarnings("ignore")
        try:
            self.cursor.execute(c_tab)
        except Warning:
            pass
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            star = r_tuple[1].strip()
            releasetime = r_tuple[2].strip()
            L = [name, star, releasetime]
            self.cursor.execute(ins, L)
            self.db.commit()
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to scrape (y/n): ")
            if c.strip().lower() == "y":
                self.offset = (self.page - 1) * 10
                url = self.baseurl + str(self.offset)
                self.loadPage(url)
                self.page += 1
            else:
                print("Scraping finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
Storing in MongoDB:

'''06_猫眼电影top100抓取.py'''
import requests
import re
import pymongo

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "http://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.page = 1
        self.offset = 0
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Film
        self.myset = self.db.top100

    # Download the page
    def loadPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # Parse the page
    def parsePage(self, html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        r_list = p.findall(html)
        # [("霸王别姬","张国荣","1994-01-01"), ...]
        self.writeTomongo(r_list)

    def writeTomongo(self, r_list):
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            star = r_tuple[1].strip()
            releasetime = r_tuple[2].strip()
            D = {"name": name,
                 "star": star,
                 "releasetime": releasetime}
            self.myset.insert_one(D)
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to scrape (y/n): ")
            if c.strip().lower() == "y":
                self.offset = (self.page - 1) * 10
                url = self.baseurl + str(self.offset)
                self.loadPage(url)
                self.page += 1
            else:
                print("Scraping finished, thanks for using!")
                break

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
  - Web client authentication (parameter name: auth)
    - auth = ('username', 'password')
      auth = ('tarenacode', 'code_2013')
    - Example:

'''09_Web客户端验证.py'''
import requests
import re

class NoteSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.url = "http://code.tarena.com.cn/"
        self.proxies = {"http": "http://309435365:[email protected]:16817"}
        # The auth parameter holds the username and password (must be a tuple)
        self.auth = ("tarenacode", "code_2013")

    def getParsePage(self):
        res = requests.get(self.url,
                           proxies=self.proxies,
                           headers=self.headers,
                           auth=self.auth,
                           timeout=3)
        res.encoding = "utf-8"
        html = res.text
        p = re.compile('<a href=".*?>(.*?)</a>', re.S)
        r_list = p.findall(html)
        self.writePage(r_list)

    def writePage(self, r_list):
        print("Writing to file...")
        with open("达内科技.txt", "a") as f:
            for r_str in r_list:
                f.write(r_str + "\n\n")
        print("Write finished")

if __name__ == "__main__":
    spider = NoteSpider()
    spider.getParsePage()
  - SSL certificate verification (parameter name: verify)
    - verify = True: the default; the SSL certificate is verified
    - verify = False: skip certificate verification
'''10_SSL证书认证示例.py'''
import requests

url = "https://www.12306.cn/mormhweb/"
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get(url, headers=headers, verify=False)
res.encoding = "utf-8"
print(res.text)
2. Handler processors in urllib.request
  - Definition
    urlopen() is a ready-made, special-purpose opener defined by the module; it does not support proxies and similar features, so a Handler object is used to build a custom opener.
  - Common methods
    - build_opener(handler object): creates the opener object
    - opener.open(url, arguments)
  - Usage flow
    - Create the appropriate Handler object
      http_handler = urllib.request.HTTPHandler()
    - Create the custom opener object
      opener = urllib.request.build_opener(http_handler)
    - Open the URL with the opener object
      req = urllib.request.Request(url, headers=headers)
      res = opener.open(req)

'''Handler处理器示例.py'''
import urllib.request

url = "http://www.baidu.com/"
# Create the Handler object
http_handler = urllib.request.HTTPHandler()
# proxy_handler = urllib.request.ProxyHandler()
# Create the custom opener object
opener = urllib.request.build_opener(http_handler)
# Send the request with the opener object's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
  - Handler types
    - HTTPHandler(): no special functionality
    - ProxyHandler(normal proxy)
      Proxy format: {'protocol': 'IP:port'}

'''12_ProxyHandler示例.py'''
import urllib.request

url = "http://www.baidu.com/"
proxy = {"http": "127.0.0.1:8888"}
# Create the Handler object
pro_hand = urllib.request.ProxyHandler(proxy)
# Create the custom opener object
opener = urllib.request.build_opener(pro_hand)
# Send the request with the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
    - ProxyBasicAuthHandler(password manager object): private (authenticated) proxy
    - HTTPBasicAuthHandler(password manager object): web client authentication
  - What the password manager object is for
    - Private proxies
    - Web client authentication
  - Implementation flow (see the sketch after this list)
    - Create the password manager object
      pwdmg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    - Add the credentials to the password manager
      pwdmg.add_password(None, webserver, user, passwd)
    - Create the Handler object
      - Private proxy
        proxy = urllib.request.ProxyBasicAuthHandler(pwdmg)
      - Web client
        webbasic = urllib.request.HTTPBasicAuthHandler(pwdmg)
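A minimal sketch of the flow above for a private proxy; the proxy address, username and password are placeholders:

    import urllib.request

    # Placeholder credentials and proxy address: replace with real values
    proxy_server = "1.2.3.4:16817"
    user = "proxy_user"
    passwd = "proxy_pass"
    url = "http://httpbin.org/get"

    # 1. Create the password manager and register the credentials
    pwdmg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    pwdmg.add_password(None, proxy_server, user, passwd)

    # 2. Create the Handler object for an authenticated proxy
    proxy_handler = urllib.request.ProxyBasicAuthHandler(pwdmg)

    # 3. Build the custom opener and send the request
    opener = urllib.request.build_opener(proxy_handler)
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    res = opener.open(req)
    print(res.read().decode("utf-8"))

Note that ProxyBasicAuthHandler only supplies the credentials; in practice it is usually combined with a ProxyHandler pointing at the same proxy so that traffic is actually routed through it.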
Installation:
  - Windows: install selenium
    Run in the Anaconda Prompt: python -m pip install selenium
  - Ubuntu: install the Scrapy framework
    Scrapy has quite a few dependencies; the full list is below (some may already be installed)
    sudo apt-get install libssl-dev
sudo apt-get install libffi-dev
sudo apt-get install python3-dev
sudo apt-get install build-essential
sudo apt-get install libxml2
sudo apt-get install libxml2-dev
sudo apt-get install libxslt1-dev
sudo apt-get install zlib1g-dev
- sudo pip3 install Scrapy
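A quick way to confirm the installs, assuming the commands above ran without errors; run whichever import matches the machine (selenium on Windows, Scrapy on Ubuntu):

    # Each import should succeed without an ImportError on the machine where it was installed
    import selenium
    print(selenium.__version__)

    import scrapy
    print(scrapy.__version__)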