Using the Scrapy framework with a simulated browser to crawl JD.com face-mask listings and store them in MySQL, SQLite, and MongoDB
Because JD.com's listing pages are rendered dynamically by JavaScript, a plain HTTP request does not return the product list, so the crawl drives a simulated (headless) browser from a Scrapy downloader middleware. The full code is below.
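The spider imports jdproItem from jdpro.items. The project's items.py is not shown in the original; a minimal sketch, assuming only the five fields the spider assigns, would be:

# jdpro/items.py (sketch; field names taken from the spider below)
import scrapy

class jdproItem(scrapy.Item):
    goods_name = scrapy.Field()   # product title
    goods_price = scrapy.Field()  # listed price
    goods_img = scrapy.Field()    # product image URL
    platfrom = scrapy.Field()     # text from the .p-commit block (field name kept as in the spider)
    sales = scrapy.Field()        # text from the .p-shop block

The spider itself (jdpro/spiders/mask.py):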
# -*- coding: utf-8 -*-
# jdpro/spiders/mask.py
import scrapy
from scrapy import Request
from jdpro.items import jdproItem

num = 0  # global page counter


class MaskSpider(scrapy.Spider):
    name = 'mask'
    allowed_domains = ['list.jd.com']

    def __init__(self):
        self.urls = [
            "https://list.jd.com/list.html?cat=1316,1381,1392&sort=sort_totalsales15_desc&trans=1&page=85&JL=6_0_0#J_main"]

    def start_requests(self):
        for url_str in self.urls:
            # page "0" tells the Selenium middleware to load the URL directly
            yield Request(url=url_str, callback=self.parse, meta={"page": "0"}, dont_filter=True)

    def parse(self, response):
        # with open("jd.html", "wb") as f:
        #     f.write(response.body)
        item = jdproItem()
        li_list = response.css('#plist > ul > li')
        page_next = response.css('#J_bottomPage > span.p-num > a.pn-next')
        print("li_list is :::::: ", li_list)
        for li in li_list:
            # Product name: some listings keep the title in the second <em> text node.
            try:
                goods_name = li.xpath(r'./div/div/a/em/text()')[0].extract().strip("\n\t ")
                if goods_name == "":
                    goods_name = li.xpath(r'./div/div/a/em/text()')[1].extract().strip("\n\t ")
            except Exception as e:
                print(e)
                goods_name = "unavailable"
            try:
                goods_price = li.xpath(r'.//div[@class="p-price"]/strong/i/text()')[0].extract()
            except Exception as e:
                print(e)
                goods_price = "no price"
            # Image URL: fall back to the lazy-load attribute when @src is missing.
            try:
                goods_img = "https:" + li.xpath('.//div[contains(@class,"p-img")]/a/img/@src')[0].extract()
            except Exception as e:
                print(e)
                goods_img = "https:" + li.xpath('.//div[contains(@class,"p-img")]/a/img/@data-lazy-img')[0].extract()
            try:
                platfrom = li.xpath('.//div[contains(@class,"p-commit")]/strong/a/text()')[0].extract()
            except Exception as e:
                platfrom = "unavailable"
            try:
                sales = li.xpath('.//div[@class="p-shop"]/span/a/text()')[0].extract().strip(".")
            except Exception as e:
                print(e)
                sales = "unavailable"
            item["goods_name"] = goods_name
            item["goods_price"] = goods_price
            item["goods_img"] = goods_img
            item["platfrom"] = platfrom
            item["sales"] = sales
            yield item
        # Pagination: if a "next page" button exists, re-request the same URL with
        # page "2" so the Selenium middleware clicks the button instead of reloading.
        global num
        if len(page_next) > 0:
            num += 1
            if num < 260:
                print("Crawling page {}".format(num))
                yield Request(url=response.url, callback=self.parse, meta={"page": "2"}, dont_filter=True)
            else:
                print("Crawl finished")
The relevant entries in settings.py set a desktop User-Agent, disable robots.txt checking, register the Selenium downloader middleware, and enable one of the storage pipelines (the others stay commented out). The crawl is then started with scrapy crawl mask.

# settings.py (relevant entries)
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0'
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    # 'jdpro.middlewares.JdproDownloaderMiddleware': 543,
    'jdpro.middlewares.SeleniumMiddleware': 543,
}

ITEM_PIPELINES = {
    # 'jdpro.pipelines.writeFilesPipeline': 300,
    'jdpro.pipelines.saveSqlitePipeline': 301,
    # 'jdpro.pipelines.saveMysqlPipeline': 302,
    # 'jdpro.pipelines.saveMongodbPipeline': 303,
}
pipelines.py defines four interchangeable pipelines: write to a text file, and store into SQLite, MySQL, or MongoDB.

# -*- coding: utf-8 -*-
# pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Write items to a text file, one JSON object per line
import json


class writeFilesPipeline(object):
    def open_spider(self, spider):
        self.fp = open("data.txt", "w", encoding="utf8")

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        dic = dict(item)
        string = json.dumps(dic, ensure_ascii=False)
        self.fp.write(string + "\n")
        return item
# Store items in a SQLite database
import sqlite3


class saveSqlitePipeline(object):
    def open_spider(self, spider):
        # open (or create) the database file
        self.conn = sqlite3.connect("Goods.db")

    def close_spider(self, spider):
        # close the connection
        self.conn.close()

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        sql = 'insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) values("%s","%s","%s","%s","%s")' % (
            item['goods_name'], item['goods_price'], item['goods_img'], item['platfrom'], item['sales'])
        # execute the SQL statement
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print("Insert failed...")
            print(e)
            self.conn.rollback()
        return item
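The pipeline assumes a Goods table already exists in Goods.db; the original does not show how it is created. A minimal sketch of an open_spider that creates it on first run (schema assumed from the five columns in the INSERT above, all stored as TEXT):

    def open_spider(self, spider):
        self.conn = sqlite3.connect("Goods.db")
        # create the table if it is not there yet; column names match the INSERT above
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS Goods ("
            "goods_name TEXT, goods_price TEXT, goods_img TEXT, platfrom TEXT, sales TEXT)"
        )
        self.conn.commit()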
# Store items in a MySQL database
import pymysql


class saveMysqlPipeline(object):
    def open_spider(self, spider):
        # connect to the database (port must be an int for pymysql)
        self.conn = pymysql.Connect(host="xxxxxx", port=3306, user="root", password="xxxxxx", database="xxxxxx",
                                    charset="utf8")

    def close_spider(self, spider):
        # close the connection
        self.conn.close()

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        sql = 'insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) values("%s","%s","%s","%s","%s")' % (
            item['goods_name'], item['goods_price'], item['goods_img'], item['platfrom'], item['sales'])
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print("Insert failed...")
            print(e)
            self.conn.rollback()
        return item
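One caveat with both SQL pipelines: building the INSERT by string formatting breaks as soon as a value contains a double quote, and it is open to SQL injection. pymysql supports parameterized queries (sqlite3 does the same with "?" placeholders), so a safer sketch of the MySQL process_item body would be:

        sql = ("insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) "
               "values(%s, %s, %s, %s, %s)")
        # pymysql escapes each value itself when parameters are passed separately
        self.cursor.execute(sql, (item['goods_name'], item['goods_price'],
                                  item['goods_img'], item['platfrom'], item['sales']))
        self.conn.commit()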
# Store items in a MongoDB database
import pymongo


class saveMongodbPipeline(object):
    def open_spider(self, spider):
        # connect to the local MongoDB server
        self.client = pymongo.MongoClient(host="localhost", port=27017)

    def close_spider(self, spider):
        # close the connection
        self.client.close()

    def process_item(self, item, spider):
        # select the database
        db = self.client.job51
        # select the collection
        col = db.job51
        # convert the item to a dict and insert it
        print(item)
        dic = dict(item)
        col.insert_one(dic)  # insert() is deprecated in pymongo 3+; insert_one() is the current API
        return item
The simulated browser itself lives in a downloader middleware (middlewares.py). It keeps a single headless Chrome instance: the first request loads the listing URL directly, and every follow-up request with meta page == "2" clicks the next-page button; in both cases the rendered page source is handed back to the spider as an HtmlResponse.

import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    def __init__(self):
        # Selenium 3-style API (executable_path / chrome_options / find_element_by_css_selector)
        self.options = Options()
        self.options.add_argument("--headless")
        self.browser = webdriver.Chrome(executable_path=r"D:\python_others\Spider\code\day06\tools\chromedriver.exe",
                                        chrome_options=self.options
                                        )

    def process_request(self, request, spider):
        if int(request.meta["page"]) == 2:
            # follow pagination by clicking the "next page" button in the live browser
            next_page = self.browser.find_element_by_css_selector('#J_bottomPage > span.p-num > a.pn-next')
            next_page.click()
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(10)
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf8",
                                request=request)
        else:
            # first request: load the listing URL directly
            try:
                print("url is :::::", request.url)
                self.browser.get(request.url)
                self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            except TimeoutException:
                print("page load timed out")
            time.sleep(10)
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf8",
                                request=request)
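One thing the middleware never does is quit the headless browser, so a chromedriver process is left running after each crawl. A minimal sketch of a fix using Scrapy's spider_closed signal (these methods would be added to the SeleniumMiddleware class above; the import goes at the top of middlewares.py):

from scrapy import signals

# inside class SeleniumMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # let Scrapy construct the middleware and register a shutdown hook
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # quit the headless browser once the crawl finishes
        self.browser.quit()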