Using the Scrapy framework with a simulated browser to crawl JD.com face-mask listings and store them in MySQL, SQLite, and MongoDB


Because JD.com's listing pages are loaded dynamically by JavaScript, the spider crawls them through a simulated browser (headless Chrome driven by a Selenium downloader middleware) instead of parsing the raw responses. The full code follows:

spider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from jdpro.items import jdproItem

num = 0
class MaskSpider(scrapy.Spider):
    name = 'mask'
    allowed_domains = ['list.jd.com']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Entry URL: JD face-mask category list, sorted by total sales
        self.urls = [
            "https://list.jd.com/list.html?cat=1316,1381,1392&sort=sort_totalsales15_desc&trans=1&page=85&JL=6_0_0#J_main"]

    def start_requests(self):
        for url_str in self.urls:
            yield Request(url=url_str, callback=self.parse, meta={"page": "0"}, dont_filter=True)

    def parse(self, response):
        # Debug helper: dump the rendered page to disk for inspection
        # with open("jd.html", "wb") as f:
        #     f.write(response.body)
        li_list = response.css('#plist > ul > li')
        page_next = response.css('#J_bottomPage > span.p-num > a.pn-next')
        print("li_list is :::::: ", li_list)
        for li in li_list:
            # Create a fresh item for every product instead of reusing one instance
            item = jdproItem()
            try:
                goods_name = li.xpath(r'./div/div/a/em/text()')[0].extract().strip("\n\t ")
                if goods_name == "":
                    goods_name = li.xpath(r'./div/div/a/em/text()')[1].extract().strip("\n\t ")
            except Exception as e:
                print(e)
                goods_name = ""
            try:
                goods_price = li.xpath(r'.//div[@class="p-price"]/strong/i/text()')[0].extract()
            except Exception as e:
                print(e)
                goods_price = "暂无价格"
            try:
                goods_img = "https:" + li.xpath('.//div[contains(@class,"p-img")]/a/img/@src')[0].extract()
            except Exception as e:
                print(e)
                goods_img = "https:" + li.xpath('.//div[contains(@class,"p-img")]/a/img/@data-lazy-img')[0].extract()
            try:
                platfrom = li.xpath('.//div[contains(@class,"p-commit")]/strong/a/text()')[0].extract()
            except Exception as e:
                platfrom = "暂无"
            try:
                sales = li.xpath('.//div[@class="p-shop"]/span/a/text()')[0].extract().strip(".")
            except Exception as e:
                print(e)
                sales = "暂无"
            item["goods_name"] = goods_name
            item["goods_price"] = goods_price
            item["goods_img"] = goods_img
            item["platfrom"] = platfrom
            item["sales"] = sales
            yield item

        # Keep requesting the same URL; the Selenium middleware sees meta["page"] == "2"
        # and clicks the "next page" button in the already-open browser instead.
        global num
        if len(page_next) > 0:
            num += 1
            if num < 260:
                print("Crawling page {}".format(num))
                yield Request(url=response.url, callback=self.parse, meta={"page": "2"}, dont_filter=True)
            else:
                print("Finished crawling")

settings.py


USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0'

ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
   # 'jdpro.middlewares.JdproDownloaderMiddleware': 543,
   'jdpro.middlewares.SeleniumMiddleware': 543,
}

ITEM_PIPELINES = {
   # 'jdpro.pipelines.writeFilesPipeline': 300,
   'jdpro.pipelines.saveSqlitePipeline': 301,
   # 'jdpro.pipelines.saveMysqlPipeline': 302,
   # 'jdpro.pipelines.saveMongodbPipeline': 303,
}
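
With the Selenium middleware and one storage pipeline enabled, the crawl is started from the project root with Scrapy's usual command; the other pipelines can be switched on by uncommenting their entries above:

scrapy crawl mask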

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Write items to a local text file, one JSON object per line
import json


class writeFilesPipeline(object):

    def open_spider(self, spider):
        self.fp = open("data.txt", "w", encoding="utf8")

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        dic = dict(item)
        string = json.dumps(dic, ensure_ascii=False)
        self.fp.write(string + "\n")
        return item


# Store items in an SQLite database
import sqlite3


class saveSqlitePipeline(object):

    def open_spider(self, spider):
        # Connect to (or create) the local SQLite database file
        self.conn = sqlite3.connect("Goods.db")

    def close_spider(self, spider):
        # Close the database connection
        self.conn.close()

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        # Parameterized SQL (sqlite3 uses ? placeholders) avoids quoting problems in product names
        sql = 'insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) values(?,?,?,?,?)'
        params = (item['goods_name'], item['goods_price'], item['goods_img'], item['platfrom'], item['sales'])
        # Execute the SQL statement
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            print("Insert failed")
            print(e)
            self.conn.rollback()
        return item
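
Both the SQLite and the MySQL pipelines assume that a Goods table already exists; the original post never shows its schema. A minimal one-off helper for the SQLite case, with column names taken from the item fields and the types a guess:

# create_goods_table.py (helper script, not part of the original project)
import sqlite3

conn = sqlite3.connect("Goods.db")
conn.execute("""
    CREATE TABLE IF NOT EXISTS Goods (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        goods_name  TEXT,
        goods_price TEXT,
        goods_img   TEXT,
        platfrom    TEXT,
        sales       TEXT
    )
""")
conn.commit()
conn.close()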


# Store items in a MySQL database
import pymysql


class saveMysqlPipeline(object):
    def open_spider(self, spider):
        # Connect to the MySQL server (port must be an int for pymysql)
        self.conn = pymysql.Connect(host="xxxxxx", port=3306, user="root", password="xxxxxx", database="xxxxxx",
                                    charset="utf8")

    def close_spider(self, spider):
        # Close the database connection
        self.conn.close()

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()

        # Parameterized SQL (pymysql uses %s placeholders)
        sql = 'insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) values(%s,%s,%s,%s,%s)'
        params = (item['goods_name'], item['goods_price'], item['goods_img'], item['platfrom'], item['sales'])
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            print("Insert failed")
            print(e)
            self.conn.rollback()
        return item


# Store items in a MongoDB database
import pymongo


class saveMongodbPipeline(object):
    def open_spider(self, spider):
        # Connect to the MongoDB server
        self.client = pymongo.MongoClient(host="localhost", port=27017)

    def close_spider(self, spider):
        # Close the client connection
        self.client.close()

    def process_item(self, item, spider):
        # Select the database
        db = self.client.job51
        # Select the collection
        col = db.job51
        # Convert the item to a plain dict before inserting
        print(item)
        dic = dict(item)

        col.insert_one(dic)

        return item

middlewares.py

import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from scrapy.http import HtmlResponse
from selenium.webdriver.chrome.options import Options


class SeleniumMiddleware(object):
    def __init__(self):
        # Start a headless Chrome instance; the chromedriver path is specific to the author's machine
        self.options = Options()
        self.options.add_argument("--headless")
        self.browser = webdriver.Chrome(executable_path=r"D:\python_others\Spider\code\day06\tools\chromedriver.exe",
                                        chrome_options=self.options
                                        )

    def process_request(self, request, spider):
        if int(request.meta["page"]) == 2:
            # Follow-up request: the browser already has the list page open,
            # so click the "next page" button instead of issuing a new GET.
            next_page = self.browser.find_element_by_css_selector('#J_bottomPage > span.p-num > a.pn-next')
            next_page.click()
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(10)
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf8",
                                request=request)
        else:
            # First request: load the start URL and scroll to the bottom so
            # lazily loaded prices and images are rendered.
            try:
                print("url is :::::", request.url)
                self.browser.get(request.url)
                self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            except TimeoutException:
                print("Request timed out")
            time.sleep(10)
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf8",
                                request=request)
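
The middleware above never quits the headless browser once the crawl finishes. A minimal sketch of one way to add that with Scrapy's spider_closed signal; the subclass name is an addition, not part of the original code, and DOWNLOADER_MIDDLEWARES would have to point at 'jdpro.middlewares.SeleniumShutdownMiddleware' instead:

from scrapy import signals


class SeleniumShutdownMiddleware(SeleniumMiddleware):

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Call spider_closed() when the crawl ends so headless Chrome does not linger
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.browser.quit()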
