Python 爬取玉米、小麦、水稻资讯数据，保存为本地网页文件并写入 MySQL 数据库

程序员文章站 2024-02-28 18:41:34

...

1、创建Scrapy项目

scrapy startproject ExGrain

2、进入项目目录，使用 genspider 命令创建 Spider

scrapy genspider exgrain ex-grain.cn

3、定义要抓取的数据（处理items.py文件）

# -*- coding: utf-8 -*-
import scrapy

class ExgrainItem(scrapy.Item):
    """Container for the fields collected for one scraped article."""
    # Directory of the article
    news_path = scrapy.Field()

    # Category of the article
    news_cate = scrapy.Field()
    # Title of the article
    news_title = scrapy.Field()
    # Publication date of the article
    news_date = scrapy.Field()
    # Source of the article
    news_source = scrapy.Field()
    # Lead-in (short summary) of the article
    news_guide = scrapy.Field()
    # Content of the article
    news_content = scrapy.Field()
    # Link to the article
    news_url = scrapy.Field()

4、编写提取item数据的Spider（在spiders文件夹下：exgrain.py）

# -*- coding: utf-8 -*-
# 爬取中国谷物网玉米、小麦、水稻信息数据到本地为网页形式和mysql数据库中，偶尔出现抓取数据不准确的情况
import scrapy
from ExGrain.items import ExgrainItem
import re
import os
import requests
from bs4 import BeautifulSoup
import time

class ExgrainSpider(scrapy.Spider):
    """Crawl corn, wheat and rice news articles from www.ex-grain.cn.

    ``parse`` walks the listing pages and schedules one request per article.
    ``parse_news`` turns an article page into an ``ExgrainItem`` and creates
    the local directory recorded in ``item['news_path']``.
    """

    name = 'exgrain'
    allowed_domains = ['ex-grain.cn']
    # Listing pages: corn (010301), wheat (010302) and rice (010201).
    start_urls = ['http://www.ex-grain.cn/xxfb/list.htm?type=010301','http://www.ex-grain.cn/xxfb/list.htm?type=010302',
                  'http://www.ex-grain.cn/xxfb/list.htm?type=010201']
    url = "http://www.ex-grain.cn"

    # Article URL prefix -> category label stored in ``news_cate``.
    _CATEGORY_PREFIXES = (
        ("http://www.ex-grain.cn/island/FX_010302", "小麦"),
        ("http://www.ex-grain.cn/island/FX_010301", "玉米"),
        ("http://www.ex-grain.cn/island/FX_010201", "水稻"),
    )
    # Captures the article body: group(2) is the rest of the line that
    # follows the opening <td> of the content cell.
    _CONTENT_PATTERN = re.compile(
        r'<td style="width:890px;display:block;word-break:(.*) align="left">(.*)')
    # Source used when the page leaves the source field empty.
    _DEFAULT_SOURCE = "中国谷物网"
    # Characters that must not appear in a directory name.
    _UNSAFE_PATH_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    def parse(self, response):
        """Schedule every article on a listing page, then follow pagination.

        Yields one ``scrapy.Request`` per article link (handled by
        ``parse_news``) and one per pagination link (handled by ``parse``).
        """
        # No time.sleep() here: sleeping inside a callback blocks Scrapy's
        # event loop. Request throttling is the job of the DOWNLOAD_DELAY
        # setting.
        for href in response.xpath('//tr/td/a[@class="new List"]/@href').extract():
            item = ExgrainItem()
            item['news_url'] = self.url + href
            yield scrapy.Request(url=item['news_url'], meta={'meta_1': item}, callback=self.parse_news)

        # Follow the links to the other listing pages.
        for href in response.xpath('//tr/td[@class="grayr"]/a/@href').extract():
            yield scrapy.Request(url=self.url + href, callback=self.parse)

    @staticmethod
    def _parse_header(header):
        """Return ``(source, date)`` from the article header line.

        The header looks like
        ``发布时间：2018-07-18 10:54:46  |来源：  |作者：``.
        Either value is ``""`` when the corresponding part is missing.
        """
        pipe_parts = header.split("|")
        # [3:] drops the leading "来源：" label.
        source = pipe_parts[1][3:].strip() if len(pipe_parts) > 1 else ""
        colon_parts = header.split("：")
        date = colon_parts[1].split(" ")[0] if len(colon_parts) > 1 else ""
        return source, date

    @classmethod
    def _category_for(cls, news_url):
        """Return the category label for an article URL, or ``None``."""
        for prefix, label in cls._CATEGORY_PREFIXES:
            if news_url.startswith(prefix):
                return label
        return None

    def parse_news(self, response):
        """Build an ``ExgrainItem`` from one article page.

        Title, date, source and content are all taken from ``response``, so
        every field comes from the same download of the page. Pages whose
        title, header line, content or category cannot be determined are
        logged and skipped instead of raising.

        Side effect: creates ``./Data/<category>/<date>/<title>`` if needed.
        """
        news_url = response.meta['meta_1']['news_url']

        raw_title = response.xpath('//tr/td[@class="h13"]/span/text()').extract_first()
        header = response.xpath('//tr[2]/td[@class="h3"]/text()').extract_first()
        if raw_title is None or header is None:
            self.logger.warning("Skipping %s: title or header line not found", news_url)
            return
        # Titles on the site contain stray spaces.
        news_title = raw_title.replace(" ", "")
        source, news_date = self._parse_header(header)

        # Use the page Scrapy already downloaded. Fetching it a second time
        # is a blocking call and can return different data than the response
        # the title was read from.
        page_html = response.text
        match = self._CONTENT_PATTERN.search(page_html)
        if match is None:
            self.logger.warning("Skipping %s: article content not found", news_url)
            return
        news_content = match.group(2)

        category = self._category_for(news_url)
        if category is None:
            self.logger.warning("Skipping %s: URL matches no known category", news_url)
            return

        # The lead-in is the first 70 characters of the page's <p> text.
        paragraphs = BeautifulSoup(page_html, "lxml").select("p")
        guide_text = "".join(p.get_text() for p in paragraphs).replace(" ", "")
        if news_content.startswith("<p>&nbsp;</p><table"):
            # The article is probably a table: use the title as the lead-in.
            news_guide = news_title
        elif guide_text:
            news_guide = guide_text[:70].replace("\xa0", "") + "......"
        else:
            news_guide = ""

        # Sanitise only the directory name; the item keeps the real title.
        safe_title = news_title.translate(self._UNSAFE_PATH_CHARS)
        news_path = "./Data/" + category + "/" + news_date + "/" + safe_title
        os.makedirs(news_path, exist_ok=True)
        print("处理数据:%s" % (news_path[7:]))

        item = ExgrainItem()
        item['news_title'] = news_title
        item['news_source'] = source or self._DEFAULT_SOURCE
        # Replace the page's fonts with Microsoft YaHei.
        item['news_content'] = news_content.replace('宋体', '微软雅黑').replace('仿宋','微软雅黑').replace('Courier New','微软雅黑')
        item['news_guide'] = news_guide
        item['news_date'] = news_date
        item['news_cate'] = category
        item['news_path'] = news_path
        item['news_url'] = news_url
        yield item

5.处理pipelines管道文件保存数据，可将结果保存到文件中（pipelines.py）

# -*- coding: utf-8 -*-
import json

# 转码操作，继承json.JSONEncoder的子类
class MyEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as UTF-8 text."""

    def default(self, o):
        """Decode ``bytes`` as UTF-8; defer every other type to the base class."""
        if not isinstance(o, bytes):
            return super().default(o)
        return o.decode('utf-8')

class ExgrainPipeline(object):
    """Item pipeline that saves each article's HTML under ``item['news_path']``.

    Attributes:
        fail_count: number of articles that could not be written so far.
        file_name_fail: title of the most recent article that failed, or
            ``None`` when nothing has failed.
    """

    # Characters that must not appear in a file name.
    _UNSAFE_NAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    def __init__(self):
        # Initialised once so the counter accumulates across items and
        # close_spider() works even when no item was processed.
        self.fail_count = 0
        self.file_name_fail = None

    def process_item(self, item, spider):
        """Write ``item['news_content']`` to ``<news_path>/<title>.html``.

        On an I/O or encoding error the failure is counted, reported and a
        marker file is written under ``<news_path>/[失败！]/``. The item is
        always returned so later pipelines still receive it.
        """
        title = item['news_title']
        file_name = title.translate(self._UNSAFE_NAME_CHARS) + ".html"
        try:
            # Explicit UTF-8 so the result does not depend on the platform's
            # default encoding.
            with open(item['news_path'] + "/" + file_name, "w", encoding="utf-8") as f:
                f.write(item['news_content'])
        except (OSError, UnicodeError):
            self.fail_count += 1
            self.file_name_fail = title
            print("%s文件保存失败，请注意！" % title)
            self._write_failure_marker(item['news_path'], file_name)
        return item

    def _write_failure_marker(self, news_path, file_name):
        """Best-effort: leave a marker file for an article that failed."""
        import os

        marker_dir = news_path + "/" + "[失败！]"
        try:
            # The marker directory has to exist before the file is opened.
            os.makedirs(marker_dir, exist_ok=True)
            with open(marker_dir + "/" + file_name, "w", encoding="utf-8") as f:
                f.write("<p>写入失败！</p>")
        except OSError as error:
            print("%s失败标记文件写入失败：%s" % (file_name, error))

    def close_spider(self, spider):
        """Print a summary when the spider finishes."""
        if self.fail_count != 0:
            print("%s文件保存失败了..." % self.file_name_fail)
        print("数据保存本地处理完毕，谢谢使用！")

6.增加ExGrainpipelines.py文件，同时将数据保存到mysql数据库中

# -*- coding: utf-8 -*-
import json
import pymysql
# 转码操作，继承json.JSONEncoder的子类
class MyEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` subclass that converts ``bytes`` to UTF-8 strings."""

    def default(self, o):
        """Return ``o`` as text when it is ``bytes``; otherwise use the base class."""
        return str(o, 'utf-8') if isinstance(o, bytes) else super().default(o)

class DBPipeline(object):
    """Item pipeline that stores each article in the MySQL ``exgrain`` table.

    An article whose ``news_url`` is already in the table is not inserted
    again. Database errors are logged through ``spider.logger`` and the item
    is still passed on to later pipelines.

    Attributes:
        connect: the open pymysql connection.
        cursor: cursor used for every statement.
        count: number of rows inserted during this run.
    """

    def __init__(self, host='localhost', port=3306, db='python3',
                 user='root', passwd='123456', charset='utf8'):
        """Open the database connection.

        The defaults reproduce the connection this pipeline has always used.
        """
        self.connect = pymysql.connect(
            host=host,
            port=port,
            db=db,
            user=user,
            passwd=passwd,
            charset=charset,
            use_unicode=True)
        self.cursor = self.connect.cursor()
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the project's ``MYSQL_*`` settings.

        Any setting that is not defined falls back to the constructor default.
        """
        settings = crawler.settings
        return cls(
            host=settings.get('MYSQL_HOST', 'localhost'),
            port=settings.getint('MYSQL_PORT', 3306),
            db=settings.get('MYSQL_DBNAME', 'python3'),
            user=settings.get('MYSQL_USER', 'root'),
            passwd=settings.get('MYSQL_PASSWD', '123456'),
        )

    def process_item(self, item, spider):
        """Insert ``item`` into ``exgrain`` unless its URL is already stored.

        Returns the item unchanged in every case.
        """
        try:
            # Duplicate check on the article URL.
            self.cursor.execute(
                """SELECT news_url FROM exgrain WHERE news_url = %s""", (item['news_url'],))
            repetition = self.cursor.fetchone()
            if repetition:
                print("数据库已有此条数据，不再添加",repetition[0])
            else:
                print("写入数据库中...")
                self.cursor.execute(
                    """INSERT INTO exgrain(news_cate,news_title, news_date, news_source, news_guide ,
                      news_content, news_url)VALUES(%s,%s, %s, %s, %s, %s, %s)""",
                    (item['news_cate'],item['news_title'],item['news_date'],item['news_source'],
                     item['news_guide'],item['news_content'],item['news_url']))
                self.connect.commit()
                # Count only rows that were actually committed.
                self.count += 1
        except Exception:
            # Pipeline boundary: log with traceback, undo the partial
            # transaction, and keep the crawl running.
            spider.logger.exception("Failed to store %s in MySQL", item.get('news_url'))
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        """Close the cursor and connection and print the insert count."""
        self.cursor.close()
        self.connect.close()
        print("数据库处理完毕，本次共计增加%d条数据，谢谢使用！"%self.count)

7、配置 settings 文件（settings.py）。成功连接数据库的示例可参考：https://blog.csdn.net/z564359805/article/details/81561912

# Obey robots.txt rules (disabled here); for details see: https://blog.csdn.net/z564359805/article/details/80691677
ROBOTSTXT_OBEY = False

# # Save the data to MySQL
# MYSQL_HOST = 'localhost'
# MYSQL_DBNAME = 'python3'
# MYSQL_USER = 'root'
# MYSQL_PASSWD = '123456'
# MYSQL_PORT = 3306


# Download delay
DOWNLOAD_DELAY = 4
# Override the default request headers: add User-Agent information
DEFAULT_REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
  # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  # 'Accept-Language': 'en',
}

# Configure item pipelines: the file-saving pipeline and the database pipeline are both enabled
ITEM_PIPELINES = {
   'ExGrain.pipelines.ExgrainPipeline': 100,
   'ExGrain.ExGrainpipelines.DBPipeline': 300,

}

# The log can also be written to a local file (optional settings)
LOG_FILE = "exgrain.log"
LOG_LEVEL = "DEBUG"
# Write printed output into the log as well
LOG_STDOUT = True

8、运行前请先启动 MySQL 数据库，并建好相应的数据表

# Create the database table for the grain-site articles
CREATE TABLE exgrain(id int PRIMARY KEY auto_increment not null,news_cate varchar(2),news_title varchar(100),news_date date,
news_source varchar(30),news_guide VARCHAR(150),news_content MEDIUMTEXT,news_url VARCHAR(90));

9.以上设置完毕，进行爬取：执行项目命令crawl，启动Spider：

scrapy crawl exgrain

PS：偶尔会出现抓取到的文章标题或文章内容不准确的情况，目前尚未解决。网站在刷新页面时返回的数据本身会发生变化，暂时不清楚应如何处理。