Python爬虫项目--爬取自如网房源信息
程序员文章站
2022-06-19 12:18:56
本次爬取自如网房源信息所用到的知识点: 1. requests get请求 2. lxml解析html 3. Xpath 4. MongoDB存储 正文 1.分析目标站点 1. url: http://hz.ziroom.com/z/nl/z3.html?p=2 的p参数控制分页 2. get请求 ......
本次爬取自如网房源信息所用到的知识点:
1. requests get请求
2. lxml解析html
3. xpath
4. mongodb存储
正文
1.分析目标站点
1. url: http://hz.ziroom.com/z/nl/z3.html?p=2 的p参数控制分页
2. get请求
2.获取单页源码
# -*- coding: utf-8 -*-
import requests
import time
from requests.exceptions import RequestException


def get_one_page(page):
    """Fetch one listing page of hz.ziroom.com and print its HTML.

    :param page: 1-based page number; fills the ``p`` query parameter.
    :return: None (the page source is printed, and None is returned on
             any request failure).
    """
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        # Browser-like headers so the site serves the normal page.
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/68.0.3440.106 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            print(res.text)
    except RequestException:
        # Network/HTTP error: swallow and signal failure with None.
        return None


def main():
    page = 1
    get_one_page(page)


if __name__ == '__main__':
    main()
    time.sleep(1)  # brief pause to be polite to the server
3.解析单页源码
1. 解析html文档, 目的: 测试xpath表达式
将获取的源码保存到当前文件夹下的"result.html"中, 然后通过xpath对其进行相应内容的提取, 当然你也可以使用某些在线工具.
from lxml import etree

# Parse the locally saved page ("result.html", written by get_one_page)
# so the XPath expressions can be tested offline.
# Fixed: etree.HTMLParser casing and the "resul.html" filename typo.
html = etree.parse("./result.html", etree.HTMLParser())
# The listing lives in <ul id="houseList">; the first <li> is a header row.
results = html.xpath('//ul[@id="houseList"]/li')
for result in results[1:]:
    # Title: strip the 5-character room-type prefix when present.
    raw_title = result.xpath("./div/h3/a/text()")
    title = raw_title[0][5:] if len(raw_title[0]) > 5 else ""
    # Location: drop the surrounding square brackets.
    location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", "")
    # Join the area spans with spaces, then drop the leading space.
    area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
    nearby = result.xpath("./div/div/p[2]/span/text()")[0]
    print(title)
    print(location)
    print(area)
    print(nearby)
2. 解析源代码
from lxml import etree


def parse_one_page(sourcehtml):
    """Parse one page of listing HTML and yield a dict per room.

    :param sourcehtml: raw HTML string returned by ``get_one_page``.
    :yield: dict with keys ``title``, ``location``, ``area``, ``nearby``.
    """
    content_tree = etree.HTML(sourcehtml)  # build the element tree
    # The listing lives in <ul id="houseList">; skip the header <li>.
    results = content_tree.xpath('//ul[@id="houseList"]/li')
    for result in results[1:]:
        # Title: strip the 5-character room-type prefix when present.
        raw_title = result.xpath("./div/h3/a/text()")
        title = raw_title[0][5:] if len(raw_title[0]) > 5 else ""
        # Location: drop the surrounding square brackets.
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the area spans with spaces, then drop the leading space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # Guard against an empty span list (some rows have no "nearby" text);
        # without the guard this raises IndexError (see section 4).
        raw_nearby = result.xpath("./div/div/p[2]/span/text()")
        nearby = raw_nearby[0].strip() if raw_nearby else ""
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }


def main():
    page = 1
    html = get_one_page(page)
    print(type(html))
    for item in parse_one_page(html):
        print(item)


if __name__ == '__main__':
    main()
    time.sleep(1)
4.获取多个页面
def parse_one_page(sourcehtml):
    """Parse one page of listing HTML and yield a dict per room.

    :param sourcehtml: raw HTML string returned by ``get_one_page``.
    :yield: dict with keys ``title``, ``location``, ``area``, ``nearby``.
    """
    content_tree = etree.HTML(sourcehtml)  # build the element tree
    # The listing lives in <ul id="houseList">; skip the header <li>.
    results = content_tree.xpath('//ul[@id="houseList"]/li')
    for result in results[1:]:
        # Title: strip the 5-character room-type prefix when present.
        raw_title = result.xpath("./div/h3/a/text()")
        title = raw_title[0][5:] if len(raw_title[0]) > 5 else ""
        # Location: drop the surrounding square brackets.
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the area spans with spaces, then drop the leading space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # Guard against an empty span list; an unguarded [0] raised
        # "IndexError: list index out of range" on some rows.
        raw_nearby = result.xpath("./div/div/p[2]/span/text()")
        nearby = raw_nearby[0].strip() if raw_nearby else ""
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }
        print(nearby)


def get_pages():
    """Return the total page count parsed from page 1's pager widget."""
    page = 1
    html = get_one_page(page)
    content_tree = etree.HTML(html)
    # The second <span> of the pager reads like "共N页"; strip the
    # characters "共" and "页" to isolate the number.
    pages = int(content_tree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共页"))
    return pages


def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        for item in parse_one_page(html):
            print(item)


if __name__ == '__main__':
    main()
    time.sleep(1)
5. 存储到mongodb中
需确保mongodb已启动服务, 否则必然会存储失败
def save_to_mongodb(result):
    """Insert one room-info dict into MongoDB (db ``iroomz``, collection
    ``roominfo``); MongoDB must already be running locally.

    :param result: dict produced by ``parse_one_page``.
    """
    # NOTE(review): a new client is created per record; for bulk inserts
    # the connection should be hoisted out of the loop.
    client = pymongo.MongoClient(host="localhost")
    # Select the database and collection.
    db = client.iroomz
    db_table = db.roominfo
    try:
        # insert_one replaces the deprecated Collection.insert; its
        # InsertOneResult is truthy on success.
        if db_table.insert_one(result):
            print("---存储到数据库成功---", result)
    except Exception:
        print("---存储到数据库失败---", result)
6.完整代码
# -*- coding: utf-8 -*-

import requests
import time
import pymongo
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(page):
    """Fetch one listing page and return its HTML, or None on failure.

    :param page: 1-based page number; fills the ``p`` query parameter.
    """
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        # Browser-like headers so the site serves the normal page.
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/68.0.3440.106 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def parse_one_page(sourcehtml):
    """Parse one page of listing HTML and store each room record.

    :param sourcehtml: raw HTML string returned by ``get_one_page``.
    """
    content_tree = etree.HTML(sourcehtml)  # build the element tree
    # The listing lives in <ul id="houseList">; skip the header <li>.
    results = content_tree.xpath('//ul[@id="houseList"]/li')
    for result in results[1:]:
        # Title: strip the 5-character room-type prefix when present.
        raw_title = result.xpath("./div/h3/a/text()")
        title = raw_title[0][5:] if len(raw_title[0]) > 5 else ""
        # Location: drop the surrounding square brackets.
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the area spans with spaces, then drop the leading space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # Guard against an empty span list; an unguarded [0] raises
        # IndexError on rows without a "nearby" text.
        raw_nearby = result.xpath("./div/div/p[2]/span/text()")
        nearby = raw_nearby[0].strip() if raw_nearby else ""
        data = {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }
        save_to_mongodb(data)


def get_pages():
    """Return the total page count parsed from page 1's pager widget."""
    page = 1
    html = get_one_page(page)
    content_tree = etree.HTML(html)
    # The second <span> of the pager reads like "共N页"; strip "共"/"页".
    pages = int(content_tree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共页"))
    return pages


def save_to_mongodb(result):
    """Insert one room-info dict into MongoDB (db ``iroomz``, collection
    ``roominfo``); MongoDB must already be running locally."""
    # NOTE(review): a new client per record is wasteful; hoist it for bulk runs.
    client = pymongo.MongoClient(host="localhost")
    db = client.iroomz
    db_table = db.roominfo
    try:
        # insert_one replaces the deprecated Collection.insert.
        if db_table.insert_one(result):
            print("---存储到数据库成功---", result)
    except Exception:
        print("---存储到数据库失败---", result)


def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        parse_one_page(html)


if __name__ == '__main__':
    main()
    time.sleep(1)  # brief pause to be polite to the server
7.最终结果
总结
1. 在第三步中xpath使用注意事项
title = result.xpath("./div/h3/a/text()") 此处的点'.'不能忘记, 它表示当前节点, 如果不加'.', '/'就表示从根节点开始选取
2. 在第四步获取多个页面时出现索引超出范围错误
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() 报错 IndexError: list index out of range
造成这种错误原因有两种:
1) [index] index超出list范围
2) [index] index索引内容为空
因为这里的nearby的index是0, 排除第一种情况, 那么这里就是空行了, 加句if判断就可以解决
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() #改写以后: nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()"))>0 else ""
以上主要是对爬虫过程学习的总结, 若有不对的地方, 还请指正, 谢谢!
因还在学习中, 价格部分会用到图片识别, 有待完善.
推荐阅读
-
Python爬虫使用selenium爬取qq群的成员信息(全自动实现自动登陆)
-
网易云歌单信息爬取及数据分析(python爬虫)
-
Python爬虫实例:爬取B站《工作细胞》短评——异步加载信息的爬取
-
Python爬虫实战用 BeautifulSoup 爬取电影网站信息
-
Python爬虫项目 ,爬取豆瓣top250中影片信息
-
python网络爬虫之解析网页的XPath(爬取Path职位信息)[三]
-
一个简单的python爬虫程序 爬取豆瓣热度Top100以内的电影信息
-
python爬虫_微信公众号推送信息爬取的实例
-
python3爬虫-通过selenium登陆拉钩,爬取职位信息
-
使用python爬虫实现网络股票信息爬取的demo