python爬虫—“爱彼迎”:ERR_HTTP2_COMPRESSION_ERROR/网页可能暂时无法连接,或者它已永久性地移动到了新网址。
程序员文章站
2022-07-14 21:12:41
...
爱彼迎
被爱彼迎制裁得半死不活:用 requests 请求什么数据都拿不到,甚至自己用 Chrome 浏览器打开爱彼迎并搜索之后,页面也会一直报错。
找了半天原因,也按照很多教程做了尝试,都没办法解决。只要打开新页面,就会报错,刷新一下内容才能出来。
换成Firefox后根本打不开
看来也只能用selenium打开一次,刷新一次了
既然使用了selenium,那就试试从主页进去搜索城市的功能吧。
# Absolute path to the local chromedriver binary (machine-specific; adjust it for your own setup).
path = r'/Users/chenbaba/Desktop/python/chromedriver'
# Start a Chrome session driven by that chromedriver.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# `Service` object -- confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path=path)
# Explicit-wait helper bound to `browser`, with a 10-second timeout.
wait = WebDriverWait(browser, 10)
def parse_bnb(url,kw):
    """Open the landing page and submit a search for one city.

    Loads ``url`` in the module-level ``browser``, types ``kw`` into the
    search box and clicks the "search" button.

    Args:
        url: Address of the page that carries the search form.
        kw: City name typed into the search box.
    """
    search_box_locator = (By.CSS_SELECTOR, "#Koan-via-HeroController__input")
    search_button_locator = (
        By.CSS_SELECTOR,
        "#ChinaSearchBarWithDate > form > div._mv0xzc > button",
    )

    browser.get(url)
    browser.implicitly_wait(10)

    # Type the city name as soon as the search box exists in the DOM.
    search_box = wait.until(EC.presence_of_element_located(search_box_locator))
    search_box.send_keys(kw)

    # Submit the search once the button can actually be clicked.
    search_button = wait.until(EC.element_to_be_clickable(search_button_locator))
    search_button.click()
搜索以后的结果只有15-17页,找到页面中的“下一页”按钮,点它~
# Locate the pager's "next page" link, wait until it is clickable, then follow it.
next_page_locator = (
    By.CSS_SELECTOR,
    "#site-content > div > div > div._1kss53yu > div > div > div > div._1ou8uzt > nav > ul > li._i66xk8d > a",
)
next_page = wait.until(EC.element_to_be_clickable(next_page_locator))
next_page.click()
selenium遇到ERR_HTTP2_COMPRESSION_ERROR时,报错为
selenium.common.exceptions.TimeoutException 或者
selenium.common.exceptions.WebDriverException
用 try/except 捕获这两种异常,遇到报错时刷新页面、再重试一遍即可:
except (selenium.common.exceptions.TimeoutException,selenium.common.exceptions.WebDriverException):
browser.refresh()
完整代码如下:
import requests
import pymongo
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
# Absolute path to the local chromedriver binary (machine-specific; adjust it for your own setup).
path = r'/Users/chenbaba/Desktop/python/chromedriver'
# Start the Chrome session that every function in this script drives.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# `Service` object -- confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path=path)
# Desktop-Chrome User-Agent header.
# NOTE(review): not referenced by any code visible in this file -- presumably a
# leftover from the earlier requests-based attempt; verify before removing.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
# Explicit-wait helper bound to `browser`, with a 10-second timeout.
wait = WebDriverWait(browser, 10)
# Store the results in MongoDB: connect to the server at localhost:27017 ...
client = pymongo.MongoClient('localhost',27017)
# ... select the `Real_estatedb` database ...
mydb = client['Real_estatedb']
# ... and the `airbnb` collection that scraped listings are inserted into.
airbnb = mydb['airbnb']
def _first(node, path):
    """Return the first XPath match of ``path`` under ``node``, or '' if none."""
    hits = node.xpath(path)
    return hits[0] if hits else ''


def _extract_listings(html):
    """Parse one result page and return one dict per listing found on it.

    Every field starts out empty for every listing, so a value that is
    missing on one card can never be inherited from the previous card.
    """
    tree = etree.HTML(html)
    if tree is None:
        return []

    listings = []
    for info in tree.xpath('//div[@class="_fhph4u"]/div'):
        img_hits = info.xpath('.//img[@class="_9ofhsl"]/@src')
        listings.append({
            '标题': _first(info, './div/div/meta[1]/@content'),
            '评分': _first(info, './/span[@class="_1clmxfj"]/text()'),
            '类型': _first(info, './/div[@class="_1etkxf1"]/span/span/text()[1]'),
            '厅室数量': _first(info, './/div[@class="_1etkxf1"]/span/span/text()[2]'),
            '价格(元/晚)': _first(info, './/div[@class="_1ixtnfc"]/span[2]/text()'),
            '评论': _first(info, './/div[@class="_11jctj9"]/span/span/text()'),
            # Kept in the original stored format: the list repr of all matching
            # src values with the surrounding brackets stripped.
            '图片链接': str(img_hits).strip('[]') if img_hits else '',
        })
    return listings


def _store_listings(listings):
    """Print title and price of each listing and insert it into ``airbnb``."""
    for data in listings:
        print(data['标题'], data['价格(元/晚)'])
        airbnb.insert_one(data)


def parse_info(m, last_page=17, max_retries=3):
    """Scrape the search-result pages shown in the module-level ``browser``.

    Starting at page number ``m``, each page is parsed, its listings are
    printed and inserted into the ``airbnb`` collection, and the pager's
    "next page" link is clicked. When the pager does not show up or a
    WebDriver error occurs, the page is refreshed and tried again.

    Unlike the earlier recursive version, retries are bounded: once a page
    has failed more than ``max_retries`` times in a row it is treated as the
    last page, whatever listings it shows are stored, and the function
    returns. This replaces an endless refresh/recursion on the final page,
    which has no "next page" link.

    Args:
        m: Number of the page the browser is currently showing.
        last_page: Exclusive upper bound for the page counter. The default
            of 17 keeps the original ``range(m, 17)`` behaviour.
        max_retries: Consecutive refresh-and-retry attempts allowed per page.
    """
    next_locator = (
        By.CSS_SELECTOR,
        "#site-content > div > div > div._1kss53yu > div > div > div > div._1ou8uzt > nav > ul > li._i66xk8d > a",
    )
    retryable = (
        selenium.common.exceptions.TimeoutException,
        selenium.common.exceptions.WebDriverException,
    )

    page = m
    failures = 0
    while page < last_page:
        try:
            # As before, the wait for the pager is what detects a page that
            # failed to render. Doing it before parsing, and storing only
            # after the click succeeded, means a retry cannot insert the
            # same page twice.
            next_page = wait.until(EC.element_to_be_clickable(next_locator))
            listings = _extract_listings(browser.page_source)
            next_page.click()
        except retryable:
            failures += 1
            if failures > max_retries:
                # Still no usable pager after several refreshes: assume this
                # is the final page, keep whatever it shows, and stop.
                try:
                    _store_listings(_extract_listings(browser.page_source))
                except retryable as err:
                    print('giving up on page', page, err)
                return
            browser.refresh()
            continue

        _store_listings(listings)
        failures = 0
        # Advance only after a successful click so the counter always matches
        # the page the browser is on.
        page += 1
def parse_bnb(url,kw):
    """Search the site for a city, then scrape the result pages.

    Loads ``url`` in the module-level ``browser``, enters ``kw`` in the
    search box, clicks the search button and hands over to ``parse_info``
    starting at page 1.

    Args:
        url: Address of the page that carries the search form.
        kw: City name typed into the search box.
    """
    search_box_locator = (By.CSS_SELECTOR, "#Koan-via-HeroController__input")
    search_button_locator = (
        By.CSS_SELECTOR,
        "#ChinaSearchBarWithDate > form > div._mv0xzc > button",
    )

    browser.get(url)
    browser.implicitly_wait(10)

    # Fill in the city once the search box is present in the DOM.
    search_box = wait.until(EC.presence_of_element_located(search_box_locator))
    search_box.send_keys(kw)

    # Click the search button once it is clickable.
    search_button = wait.until(EC.element_to_be_clickable(search_button_locator))
    search_button.click()

    # The browser now shows the first result page.
    parse_info(1)
def main():
    """Search Airbnb China for one city and scrape its result pages.

    The browser session is closed in a ``finally`` clause, so a failure
    while scraping no longer leaves a Chrome/chromedriver process running.
    """
    url = 'https://www.airbnb.cn/'
    # kw is the city to search for
    kw = '西安'
    try:
        parse_bnb(url,kw)
    finally:
        browser.quit()


if __name__ == '__main__':
    main()
大功告成!
初学,代码有点繁琐,见谅~
下一篇: jfinal配置servlet