Using the Scrapy framework with a simulated browser to crawl JD.com face-mask listings and store them in MySQL, SQLite, and MongoDB
Because JD.com's listing pages are rendered dynamically by JavaScript, a plain HTTP request does not return the product list, so the crawl drives a simulated (headless) browser from a Scrapy downloader middleware. The full code is below.
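The spider imports jdproItem from jdpro.items. The project's items.py is not shown in the original; a minimal sketch, assuming only the five fields the spider assigns, would be:

# jdpro/items.py (sketch; field names taken from the spider below)
import scrapy

class jdproItem(scrapy.Item):
    goods_name = scrapy.Field()   # product title
    goods_price = scrapy.Field()  # listed price
    goods_img = scrapy.Field()    # product image URL
    platfrom = scrapy.Field()     # text from the .p-commit block (field name kept as in the spider)
    sales = scrapy.Field()        # text from the .p-shop block

The spider itself (jdpro/spiders/mask.py):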
# -*- coding: utf-8 -*-
# jdpro/spiders/mask.py
import scrapy
from scrapy import Request
from jdpro.items import jdproItem

num = 0  # global page counter


class MaskSpider(scrapy.Spider):
    name = 'mask'
    allowed_domains = ['list.jd.com']

    def __init__(self):
        self.urls = [
            "https://list.jd.com/list.html?cat=1316,1381,1392&sort=sort_totalsales15_desc&trans=1&page=85&JL=6_0_0#J_main"]

    def start_requests(self):
        for url_str in self.urls:
            # page "0" tells the Selenium middleware to load the URL directly
            yield Request(url=url_str, callback=self.parse, meta={"page": "0"}, dont_filter=True)

    def parse(self, response):
        # with open("jd.html", "wb") as f:
        #     f.write(response.body)
        item = jdproItem()
        li_list = response.css('#plist > ul > li')
        page_next = response.css('#J_bottomPage > span.p-num > a.pn-next')
        print("li_list is :::::: ", li_list)
        for li in li_list:
            # Product name: some listings keep the title in the second <em> text node.
            try:
                goods_name = li.xpath(r'./div/div/a/em/text()')[0].extract().strip("\n\t ")
                if goods_name == "":
                    goods_name = li.xpath(r'./div/div/a/em/text()')[1].extract().strip("\n\t ")
            except Exception as e:
                print(e)
                goods_name = "unavailable"
            try:
                goods_price = li.xpath(r'.//div[@class="p-price"]/strong/i/text()')[0].extract()
            except Exception as e:
                print(e)
                goods_price = "no price"
            # Image URL: fall back to the lazy-load attribute when @src is missing.
            try:
                goods_img = "https:" + li.xpath('.//div[contains(@class,"p-img")]/a/img/@src')[0].extract()
            except Exception as e:
                print(e)
                goods_img = "https:" + li.xpath('.//div[contains(@class,"p-img")]/a/img/@data-lazy-img')[0].extract()
            try:
                platfrom = li.xpath('.//div[contains(@class,"p-commit")]/strong/a/text()')[0].extract()
            except Exception as e:
                platfrom = "unavailable"
            try:
                sales = li.xpath('.//div[@class="p-shop"]/span/a/text()')[0].extract().strip(".")
            except Exception as e:
                print(e)
                sales = "unavailable"
            item["goods_name"] = goods_name
            item["goods_price"] = goods_price
            item["goods_img"] = goods_img
            item["platfrom"] = platfrom
            item["sales"] = sales
            yield item
        # Pagination: if a "next page" button exists, re-request the same URL with
        # page "2" so the Selenium middleware clicks the button instead of reloading.
        global num
        if len(page_next) > 0:
            num += 1
            if num < 260:
                print("Crawling page {}".format(num))
                yield Request(url=response.url, callback=self.parse, meta={"page": "2"}, dont_filter=True)
            else:
                print("Crawl finished")
The relevant entries in settings.py set a desktop User-Agent, disable robots.txt checking, register the Selenium downloader middleware, and enable one of the storage pipelines (the others stay commented out). The crawl is then started with scrapy crawl mask.

# settings.py (relevant entries)
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0'
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    # 'jdpro.middlewares.JdproDownloaderMiddleware': 543,
    'jdpro.middlewares.SeleniumMiddleware': 543,
}

ITEM_PIPELINES = {
    # 'jdpro.pipelines.writeFilesPipeline': 300,
    'jdpro.pipelines.saveSqlitePipeline': 301,
    # 'jdpro.pipelines.saveMysqlPipeline': 302,
    # 'jdpro.pipelines.saveMongodbPipeline': 303,
}
pipelines.py defines four interchangeable pipelines: write to a text file, and store into SQLite, MySQL, or MongoDB.

# -*- coding: utf-8 -*-
# pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Write items to a text file, one JSON object per line
import json


class writeFilesPipeline(object):
    def open_spider(self, spider):
        self.fp = open("data.txt", "w", encoding="utf8")

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        dic = dict(item)
        string = json.dumps(dic, ensure_ascii=False)
        self.fp.write(string + "\n")
        return item
# Store items in a SQLite database
import sqlite3


class saveSqlitePipeline(object):
    def open_spider(self, spider):
        # open (or create) the database file
        self.conn = sqlite3.connect("Goods.db")

    def close_spider(self, spider):
        # close the connection
        self.conn.close()

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        sql = 'insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) values("%s","%s","%s","%s","%s")' % (
            item['goods_name'], item['goods_price'], item['goods_img'], item['platfrom'], item['sales'])
        # execute the SQL statement
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print("Insert failed...")
            print(e)
            self.conn.rollback()
        return item
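The pipeline assumes a Goods table already exists in Goods.db; the original does not show how it is created. A minimal sketch of an open_spider that creates it on first run (schema assumed from the five columns in the INSERT above, all stored as TEXT):

    def open_spider(self, spider):
        self.conn = sqlite3.connect("Goods.db")
        # create the table if it is not there yet; column names match the INSERT above
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS Goods ("
            "goods_name TEXT, goods_price TEXT, goods_img TEXT, platfrom TEXT, sales TEXT)"
        )
        self.conn.commit()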
# Store items in a MySQL database
import pymysql


class saveMysqlPipeline(object):
    def open_spider(self, spider):
        # connect to the database (port must be an int for pymysql)
        self.conn = pymysql.Connect(host="xxxxxx", port=3306, user="root", password="xxxxxx", database="xxxxxx",
                                    charset="utf8")

    def close_spider(self, spider):
        # close the connection
        self.conn.close()

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        sql = 'insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) values("%s","%s","%s","%s","%s")' % (
            item['goods_name'], item['goods_price'], item['goods_img'], item['platfrom'], item['sales'])
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print("Insert failed...")
            print(e)
            self.conn.rollback()
        return item
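One caveat with both SQL pipelines: building the INSERT by string formatting breaks as soon as a value contains a double quote, and it is open to SQL injection. pymysql supports parameterized queries (sqlite3 does the same with "?" placeholders), so a safer sketch of the MySQL process_item body would be:

        sql = ("insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) "
               "values(%s, %s, %s, %s, %s)")
        # pymysql escapes each value itself when parameters are passed separately
        self.cursor.execute(sql, (item['goods_name'], item['goods_price'],
                                  item['goods_img'], item['platfrom'], item['sales']))
        self.conn.commit()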
# Store items in a MongoDB database
import pymongo


class saveMongodbPipeline(object):
    def open_spider(self, spider):
        # connect to the local MongoDB server
        self.client = pymongo.MongoClient(host="localhost", port=27017)

    def close_spider(self, spider):
        # close the connection
        self.client.close()

    def process_item(self, item, spider):
        # select the database
        db = self.client.job51
        # select the collection
        col = db.job51
        # convert the item to a dict and insert it
        print(item)
        dic = dict(item)
        col.insert_one(dic)  # insert() is deprecated in pymongo 3+; insert_one() is the current API
        return item
The simulated browser itself lives in a downloader middleware (middlewares.py). It keeps a single headless Chrome instance: the first request loads the listing URL directly, and every follow-up request with meta page == "2" clicks the next-page button; in both cases the rendered page source is handed back to the spider as an HtmlResponse.

import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    def __init__(self):
        # Selenium 3-style API (executable_path / chrome_options / find_element_by_css_selector)
        self.options = Options()
        self.options.add_argument("--headless")
        self.browser = webdriver.Chrome(executable_path=r"D:\python_others\Spider\code\day06\tools\chromedriver.exe",
                                        chrome_options=self.options
                                        )

    def process_request(self, request, spider):
        if int(request.meta["page"]) == 2:
            # follow pagination by clicking the "next page" button in the live browser
            next_page = self.browser.find_element_by_css_selector('#J_bottomPage > span.p-num > a.pn-next')
            next_page.click()
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(10)
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf8",
                                request=request)
        else:
            # first request: load the listing URL directly
            try:
                print("url is :::::", request.url)
                self.browser.get(request.url)
                self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            except TimeoutException:
                print("page load timed out")
            time.sleep(10)
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf8",
                                request=request)
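One thing the middleware never does is quit the headless browser, so a chromedriver process is left running after each crawl. A minimal sketch of a fix using Scrapy's spider_closed signal (these methods would be added to the SeleniumMiddleware class above; the import goes at the top of middlewares.py):

from scrapy import signals

# inside class SeleniumMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # let Scrapy construct the middleware and register a shutdown hook
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # quit the headless browser once the crawl finishes
        self.browser.quit()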