欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

python3爬虫-通过selenium获取到dj商品

程序员文章站 2022-06-24 14:13:02
from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.sup... ......
import json
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Entry URL: every scraping session starts from the JD.com home page.
jd_url_login = "https://www.jd.com/"


class customizeexception(Exception):
    """Application error carrying a numeric status code and a message.

    Used as control flow: status 10000 signals "last result page reached",
    status 20000 signals "empty goods_list".

    NOTE(review): the name is kept lowercase because existing callers use
    it; the conventional spelling would be ``CustomizeException``.
    """

    def __init__(self, status, msg):
        # Forward msg to Exception so str(exc) / exc.args behave normally.
        super().__init__(msg)
        self.status = status
        self.msg = msg


class jd:
    """Headless-Chrome scraper for jd.com search results.

    For each search term it walks every result page, scrolls to force the
    lazy-loaded items to render, and appends one JSON line per product
    (name / price / comment count / shop) to ``jd-<term>.json``.

    NOTE(review): the class name is kept lowercase because the ``__main__``
    block instantiates it by this name; conventional spelling would be ``JD``.
    """

    def __init__(self):
        self.browser = None
        self.__init_browser()

    def __init_browser(self):
        """Start a headless Chrome instance and open the JD home page."""
        options = Options()
        options.add_argument("--headless")
        # Hide the "Chrome is being controlled by automated software" banner
        # (the option key is case-sensitive: 'excludeSwitches').
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Disable image loading to speed up page rendering.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        self.browser = webdriver.Chrome(options=options)
        # Maximise the window so lazy-loaded elements fall into the viewport.
        self.browser.maximize_window()
        # Implicit wait of 3 s for every element lookup.
        self.browser.implicitly_wait(3)
        self.browser.get(jd_url_login)
        self.wait = WebDriverWait(self.browser, 10)

    def __search_goods(self, goods):
        """Open the output file for *goods*, then submit it in the search box."""
        # run() re-enters here for every new term: close the previous
        # term's file first so the handle is not leaked.
        if hasattr(self, "file"):
            self.file.close()
        self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
        self.wait.until(EC.presence_of_all_elements_located((By.ID, "key")))
        search_input = self.browser.find_element(By.ID, "key")
        search_input.clear()
        search_input.send_keys(goods, Keys.ENTER)

    def __get_goods_info(self, page_source):
        """Parse one result page and yield a dict per product."""
        selector_html = etree.HTML(page_source)
        # Product name is taken from the anchor's title attribute.
        goods_name = selector_html.xpath("//div[@class='gl-i-wrap']//div[contains(@class,'p-name')]/a/@title")
        # Product price.
        goods_price = selector_html.xpath("//div[@class='gl-i-wrap']//div[@class='p-price']/strong/i/text()")
        # Comment count: string(.) flattens nested text nodes into one string.
        comment_num_selector = selector_html.xpath("//div[@class='p-commit']/strong")
        comment_num = [selector.xpath("string(.)") for selector in comment_num_selector]
        # Shop name.
        shop_name = selector_html.xpath("//a[@class='curr-shop']/text()")
        # zip() truncates to the shortest list, so products missing any of
        # the four fields are silently dropped -- TODO confirm acceptable.
        for name, price, comments, shop in zip(goods_name, goods_price, comment_num, shop_name):
            yield {
                "goods_name": name,
                "goods_price": price,
                "comment_num": comments,
                "shop_name": shop,
            }

    def __swipe_page(self):
        """Scroll down until the document height stops growing (all lazy
        content rendered), then return the full page source."""
        # JS property names are case-sensitive: scrollHeight / scrollTo.
        height = self.browser.execute_script("return document.body.scrollHeight;")
        self.browser.execute_script("window.scrollTo(0, {});".format(height))
        while True:
            time.sleep(1)  # give lazy-loaded content time to render
            now_height = self.browser.execute_script("return document.body.scrollHeight;")
            if height == now_height:
                return self.browser.page_source
            self.browser.execute_script("window.scrollTo({}, {});".format(height, now_height))
            height = now_height

    def __is_element_exists(self, xpath):
        """Return True if *xpath* matches at least one element."""
        try:
            self.browser.find_element(By.XPATH, xpath)
            return True
        except NoSuchElementException:
            return False

    def __click_next_page(self):
        """Click the next-page link; raise customizeexception(10000, ...)
        when the last result page has been reached."""
        self.wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "pn-next")))
        xpath = "//a[@class='pn-next']"
        if not self.__is_element_exists(xpath):
            raise customizeexception(10000, "该商品访问完毕")
        self.browser.find_element(By.XPATH, xpath).click()

    def __write_to_json(self, dic: dict):
        """Append *dic* as one JSON line to the current output file."""
        data_json = json.dumps(dic, ensure_ascii=False)
        self.file.write(data_json + "\n")

    def run(self, goods):
        """Scrape every result page for *goods*; when the pages are
        exhausted, pop the next term from the module-level ``goods_list``
        and recurse until the list is empty."""
        self.__search_goods(goods)
        n = 1
        while True:
            print("正在爬取商品 <{}>---第{}页......".format(goods, n))
            time.sleep(3)
            html = self.__swipe_page()
            for dic in self.__get_goods_info(html):
                self.__write_to_json(dic)
            try:
                self.__click_next_page()
            except customizeexception:
                try:
                    # goods_list is a global defined in the __main__ block.
                    goods = goods_list.pop(0)
                    self.run(goods)
                except IndexError:
                    return
            n += 1

    def __del__(self):
        # quit() (unlike close()) also terminates the chromedriver process.
        if self.browser is not None:
            self.browser.quit()
        # self.file only exists once __search_goods() has been called.
        if hasattr(self, "file"):
            self.file.close()


if __name__ == '__main__':
    # Renamed from ``jd`` to avoid shadowing the class of the same name.
    spider = jd()
    # Module-level on purpose: jd.run() pops the next term from this list
    # when the current term's pages are exhausted.
    goods_list = ["纯牛奶", "酸奶", "奶茶", "床上用品", "电磁炉", "电视", "小米笔记本", "华硕笔记本", "联想笔记本", "男士洗面奶", "女士洗面奶", "沐浴露", "洗发露",
                  "牙刷", "牙膏", "拖鞋", "剃须刀", "水手服", "运动服", "红龙果", "苹果", "香蕉", "洗衣液", "电饭煲"]
    try:
        goods = goods_list.pop(0)
    except IndexError:
        raise customizeexception(20000, "goods_list不能为空")
    try:
        spider.run(goods)
    finally:
        # Force __del__ so the browser and the output file are closed.
        del spider