python3爬虫-通过selenium获取到jd商品
程序员文章站
2022-03-20 19:25:45
from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.sup... ......
"""Scrape JD (jd.com) search results with Selenium and dump them as JSON lines.

For every search keyword the spider types the keyword into the JD search box,
scrolls each result page to the bottom so lazily loaded items are rendered,
extracts name / price / comment count / shop from the page source with lxml,
and appends one JSON object per product to ``jd-<keyword>.json``.
"""
import json
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

jd_url_login = "https://www.jd.com/"


class CustomizeException(Exception):
    """Application error carrying a numeric status code and a message.

    Status 10000 is raised when a keyword has no further result pages;
    status 20000 is raised when the keyword list is empty.
    """

    def __init__(self, status, msg):
        super().__init__(status, msg)
        self.status = status
        self.msg = msg


class JD:
    """Headless-Chrome spider for JD search result pages."""

    def __init__(self):
        # Set both attributes before launching the browser so close()/__del__
        # are safe even if browser start-up fails half-way.
        self.browser = None
        self.file = None
        self.__init_browser()

    def __init_browser(self):
        """Start headless Chrome, open the JD home page and create the waiter."""
        options = Options()
        options.add_argument("--headless")
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Disable image loading.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        self.browser = webdriver.Chrome(options=options)
        # Maximize the browser window.
        self.browser.maximize_window()
        # Implicit wait of 3 seconds for element look-ups.
        self.browser.implicitly_wait(3)
        self.browser.get(jd_url_login)
        self.wait = WebDriverWait(self.browser, 10)

    def __search_goods(self, goods):
        """Open the output file for *goods* and submit it in the search box."""
        # Close the previous keyword's file first; otherwise one handle leaks
        # per keyword.
        if self.file is not None:
            self.file.close()
        self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
        self.wait.until(EC.presence_of_all_elements_located((By.ID, "key")))
        search_input = self.browser.find_element(By.ID, "key")
        search_input.clear()
        search_input.send_keys(goods, Keys.ENTER)

    def __get_goods_info(self, page_source):
        """Yield one dict per product parsed from *page_source*.

        Each dict has the keys ``goods_name``, ``goods_price``,
        ``comment_num`` and ``shop_name``.
        """
        selector_html = etree.HTML(page_source)
        # Product name, taken from the link's title attribute.
        # TODO: read the link's text content instead of the title attribute.
        goods_name = selector_html.xpath("//div[@class='gl-i-wrap']//div[contains(@class,'p-name')]/a/@title")
        # Product price.
        goods_price = selector_html.xpath("//div[@class='gl-i-wrap']//div[@class='p-price']/strong/i/text()")
        # Number of comments (full text content of the <strong> node).
        comment_nodes = selector_html.xpath("//div[@class='p-commit']/strong")
        comment_num = [node.xpath("string(.)") for node in comment_nodes]
        # Shop name.
        shop_name = selector_html.xpath("//a[@class='curr-shop']/text()")
        # NOTE(review): the four lists are collected independently and paired
        # by position; if any product lacks one of the fields the columns
        # would shift and zip() would truncate. Verify against the live page.
        for name, price, comments, shop in zip(goods_name, goods_price, comment_num, shop_name):
            yield {
                "goods_name": name,
                "goods_price": price,
                "comment_num": comments,
                "shop_name": shop,
            }

    def __swipe_page(self):
        """Scroll down until the page height stops growing; return the source."""
        height = self.browser.execute_script("return document.body.scrollHeight;")
        self.browser.execute_script("window.scrollTo(0, {});".format(height))
        while True:
            # Give lazily loaded content a moment to render.
            time.sleep(1)
            now_height = self.browser.execute_script("return document.body.scrollHeight;")
            if height == now_height:
                return self.browser.page_source
            # scrollTo takes (x, y); only the vertical offset matters here.
            self.browser.execute_script("window.scrollTo(0, {});".format(now_height))
            height = now_height

    def __is_element_exists(self, xpath):
        """Return True if *xpath* matches an element on the current page."""
        try:
            self.browser.find_element(By.XPATH, xpath)
            return True
        except NoSuchElementException:
            return False

    def __click_next_page(self):
        """Click the "next page" link.

        Raises:
            CustomizeException: status 10000 when there is no clickable
                next-page link, i.e. the current keyword is finished.
        """
        try:
            self.wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "pn-next")))
        except TimeoutException as exc:
            # No pager at all: treat it as "finished" instead of aborting the
            # remaining keywords.
            raise CustomizeException(10000, "该商品访问完毕") from exc
        # Exact class match: an element whose class attribute is anything
        # other than exactly "pn-next" is not treated as a usable link.
        xpath = "//a[@class='pn-next']"
        if not self.__is_element_exists(xpath):
            raise CustomizeException(10000, "该商品访问完毕")
        self.browser.find_element(By.XPATH, xpath).click()

    def __write_to_json(self, dic: dict):
        """Append *dic* to the current output file as one JSON line."""
        data_json = json.dumps(dic, ensure_ascii=False)
        self.file.write(data_json + "\n")

    def __crawl_goods(self, goods):
        """Search for *goods* and scrape every result page of that keyword."""
        self.__search_goods(goods)
        page = 1
        while True:
            print("正在爬取商品 <{}>---第{}页......".format(goods, page))
            time.sleep(3)
            html = self.__swipe_page()
            for dic in self.__get_goods_info(html):
                self.__write_to_json(dic)
            try:
                self.__click_next_page()
            except CustomizeException:
                return
            page += 1

    def run(self, goods, pending_goods=None):
        """Scrape *goods*, then every keyword in *pending_goods* in order.

        Args:
            goods: first search keyword.
            pending_goods: optional iterable of further keywords; it is not
                modified.

        Keywords are processed in a plain loop rather than by recursion, so
        finishing one keyword never re-enters the page loop of an earlier one.
        """
        keywords = [goods]
        if pending_goods is not None:
            keywords.extend(pending_goods)
        for keyword in keywords:
            self.__crawl_goods(keyword)

    def close(self):
        """Close the output file and shut the browser down; safe to repeat."""
        output = getattr(self, "file", None)
        if output is not None:
            output.close()
            self.file = None
        browser = getattr(self, "browser", None)
        if browser is not None:
            # quit() ends the whole WebDriver session, not just the window.
            browser.quit()
            self.browser = None

    def __del__(self):
        # Fallback only; callers should invoke close() explicitly.
        self.close()


if __name__ == '__main__':
    goods_list = ["纯牛奶", "酸奶", "奶茶", "床上用品", "电磁炉", "电视", "小米笔记本", "华硕笔记本", "联想笔记本", "男士洗面奶",
                  "女士洗面奶", "沐浴露", "洗发露", "牙刷", "牙膏", "拖鞋", "剃须刀", "水手服", "运动服", "红龙果", "苹果", "香蕉", "洗衣液", "电饭煲"]
    if not goods_list:
        raise CustomizeException(20000, "goods_list不能为空")
    spider = JD()
    try:
        spider.run(goods_list[0], goods_list[1:])
    finally:
        spider.close()
上一篇: Python头脑风暴4