Python Web Scraping in Practice: A Hands-On Guide to Scraping Agricultural Produce Data (Code Included)
Preface
The text and images in this article come from the internet and are provided for learning and exchange only; they serve no commercial purpose, and copyright remains with the original authors. If there is any problem, please contact us promptly so it can be dealt with.
The site to be scraped belongs to Wanbang International Group (万邦国际集团). Founded in 2010 and headquartered in Zhengzhou, Henan Province, the group follows the motto of "rooted in agriculture, safeguarding people's livelihood, serving the whole country", and its business spans the entire agricultural chain: integrated cold-chain logistics for agricultural produce, efficient ecological agriculture development, fresh-food chain supermarkets, cross-border e-commerce, and import/export trade. It has been recognised as a key leading enterprise, one of the national "Top Ten Comprehensive Agricultural Produce Markets", a "Star Creation Space" (星创天地), and an advanced private enterprise in the national "10,000 Enterprises Helping 10,000 Villages" targeted poverty-alleviation campaign. The Wanbang agricultural produce logistics park that the group has built and operates in Zhongmu County represents a cumulative investment of 10 billion RMB, occupies 5,000 mu of land, and has 3.5 million square metres of floor space. It hosts more than 6,000 resident merchants; in 2017 it traded 91.3 billion RMB worth, or 17.2 million tonnes, of agricultural and sideline products, ranking among the highest in the country and realising "buy globally, sell nationwide" for agricultural produce.
Its price lookup is a plain GET request, the pages are well structured, and the site is unlikely to change much in the near term, so it is easy to analyse; that is why it was chosen. A quick probe of the query is sketched below.
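Before writing the full scraper, it helps to confirm the shape of that GET request. The following is a minimal sketch and not part of the article's original code; the parameter names (pageno, itemname, datestart, dateend) are taken from the query URLs used later in the article, and the date range is only an example:

# Minimal probe of the price-query endpoint; parameter names come from the
# query URLs used elsewhere in this article.
import requests

params = {
    "pageno": 1,                # result page number
    "itemname": "白菜",         # product name (Chinese cabbage)
    "datestart": "2017/10/1",   # start of the date range
    "dateend": "2020/3/31",     # end of the date range
}
resp = requests.get("http://www.wbncp.com/pricequery.aspx", params=params)
print(resp.status_code)
print(resp.text[:500])          # first part of the returned HTML

If the status code is 200 and the returned HTML contains the price table, the query works and the real parsing can begin.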
1. Scraping the data with requests
# _*_ coding:utf-8 _*_
# Author: 未央
# Created: 2020/4/12 16:03
# File: scrapy_lab1.py
# IDE: PyCharm
import csv
import codecs
import requests                    # HTTP library used for the GET requests
from bs4 import BeautifulSoup      # HTML parser from the bs4 package
from datetime import datetime


class Produce:
    price_data = []    # price records for one product
    item_name = ""     # product (category) name

    def __init__(self, category):
        self.item_name = category
        self.price_data = []

    # Fetch one page of price data (defaults to page 1)
    def get_price_page_data(self, page_index=1):
        url = 'http://www.wbncp.com/pricequery.aspx?pageno=' + str(page_index) + \
              '&itemname=' + self.item_name + '&datestart=2017/10/1&dateend=2020/3/31'
        strhtml = requests.get(url)                        # GET the page
        # print(strhtml.text)
        soup = BeautifulSoup(strhtml.text, 'html.parser')  # parse the HTML document
        # print(soup)
        table_node = soup.find_all('table')
        # number = 0
        # for table in table_node:
        #     number += 1
        #     print(number, table)
        all_price_table = table_node[21]                   # the table that holds the price rows
        # print(all_price_table)
        for tr in all_price_table.find_all('tr'):
            number = 0
            price_line = []
            for td in tr.find_all('td'):
                number += 1
                # print(number, td)
                if number == 1:
                    price_line.append(td.get_text().split())    # product name
                elif number == 2:
                    price_line.append(td.get_text().split())    # origin
                elif number == 3:
                    price_line.append(td.get_text().split())    # specification
                elif number == 4:
                    price_line.append(td.get_text().split())    # unit
                elif number == 5:
                    price_line.append(td.get_text().split())    # highest price
                elif number == 6:
                    price_line.append(td.get_text().split())    # lowest price
                elif number == 7:
                    price_line.append(td.get_text().split())    # average price
                elif number == 8:
                    price_line.append(datetime.strptime(td.get_text().replace('/', '-'), '%Y-%m-%d'))  # date
            self.price_data.append(price_line)
        return

    # Fetch every result page
    def get_price_data(self):
        for i in range(33):
            self.get_price_page_data(str(i))
        return

    # Write the scraped data to a CSV file at D:\data_pytorch\<item name>.csv
    def data_write_csv(self):
        # file_address is the output path; self.price_data holds the rows to write
        self.get_price_data()
        file_address = "D:\\data_pytorch\\" + self.item_name + ".csv"
        file_csv = codecs.open(file_address, 'w+', 'utf-8')
        writer = csv.writer(file_csv, delimiter=' ', quotechar=' ', quoting=csv.QUOTE_MINIMAL)
        for temp_data in self.price_data:
            writer.writerow(temp_data)
        print(self.item_name + ": scraped data saved to file!")

    # Read the CSV file back as dictionaries from D:\data_pytorch\<item name>.csv
    def data_reader_csv(self):
        file_address = "D:\\data_pytorch\\" + self.item_name + ".csv"
        with open(file_address, 'r', encoding='utf8') as fp:
            # list comprehension gathers the rows; csv.DictReader yields dict-like records
            data_list = [i for i in csv.DictReader(fp, fieldnames=None)]
        print(self.item_name + " data:")
        print(data_list)
        return data_list


item_list = ["白菜", "包菜", "土豆", "菠菜", "蒜苔"]   # cabbage, round cabbage, potato, spinach, garlic shoots
for temp_name in item_list:
    produce = Produce(temp_name)
    produce.data_write_csv()
    data = produce.data_reader_csv()
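One fragile spot in the script above is table_node[21]: the price table is picked out by a hard-coded position, so a small layout change silently points the index at the wrong table. A hedged alternative is to select the price rows directly by their CSS class, borrowing the observation that the Scrapy spider in section 2 relies on, namely that each price row is a <tr> with class "center" or "center gray" (whether the header row shares that class is an assumption worth checking). Inside get_price_page_data(), once soup has been built, the row loop could instead start from:

# Sketch: select the price rows by class instead of relying on the price table
# being the 22nd <table> on the page. The class names are the ones the Scrapy
# spider below matches on; verify them against the live page before relying on this.
price_rows = soup.find_all('tr', class_='center')   # BeautifulSoup also matches rows whose class is "center gray"
for tr in price_rows:
    cells = [td.get_text(strip=True) for td in tr.find_all('td')]
    print(cells)   # expected order: name, origin, specification, unit, high, low, average, date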
After running it, the file contents look like this:
2. Scraping the data with Scrapy
As in the earlier learning examples, I will not walk through this step by step; the code is given directly below.
The code for items.py is as follows:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class PriceSpiderItemLoader(ItemLoader):
    # Custom ItemLoader used to store the fields extracted by the spider
    default_output_processor = TakeFirst()


class PriceSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()          # product name
    address = scrapy.Field()       # origin
    norms = scrapy.Field()         # specification
    unit = scrapy.Field()          # unit
    high = scrapy.Field()          # highest price
    low = scrapy.Field()           # lowest price
    price_ave = scrapy.Field()     # average price
    price_date = scrapy.Field()    # date
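To see what default_output_processor = TakeFirst() actually buys you, here is a stand-alone illustration that is not part of the project: selectors hand extracted values to the loader as lists of strings, and TakeFirst keeps only the first non-empty one. With the Scrapy version used in this article (circa 2020) TakeFirst is importable from scrapy.loader.processors; newer releases provide it through the itemloaders package instead.

# Stand-alone illustration (not part of the project) of the TakeFirst output processor.
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class DemoItem(scrapy.Item):
    name = scrapy.Field()


class DemoLoader(ItemLoader):
    default_output_processor = TakeFirst()   # keep only the first extracted value


loader = DemoLoader(item=DemoItem())
loader.add_value('name', ['白菜', '一级'])   # selectors normally return a list of strings
print(loader.load_item())                    # {'name': '白菜'}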
The code for settings.py is as follows:
# -*- coding: utf-8 -*-
# Scrapy settings for price_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy.exporters import JsonLinesItemExporter


# By default the exported Chinese shows up as hard-to-read Unicode escape sequences.
# Define a subclass that keeps the original characters
# (simply set the parent class's ensure_ascii attribute to False).
class CustomJsonLinesItemExporter(JsonLinesItemExporter):
    def __init__(self, file, **kwargs):
        super(CustomJsonLinesItemExporter, self).__init__(file, ensure_ascii=False, **kwargs)


# Enable the newly defined exporter class (used when exporting a JSON feed)
FEED_EXPORTERS = {
    'json': 'price_spider.settings.CustomJsonLinesItemExporter',
}

BOT_NAME = 'price_spider'

SPIDER_MODULES = ['price_spider.spiders']
NEWSPIDER_MODULE = 'price_spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'price_spider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
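The exporter subclass exists for one reason: by default, JSON serialisation escapes non-ASCII characters, so Chinese text comes out as \u escape sequences. A stand-alone illustration with the plain json module (independent of Scrapy and not part of the project) shows the difference ensure_ascii=False makes:

# Why ensure_ascii matters: the default escapes Chinese into unreadable \u sequences.
import json

row = {"name": "白菜", "price_ave": "0.85"}
print(json.dumps(row))                      # {"name": "\u767d\u83dc", "price_ave": "0.85"}
print(json.dumps(row, ensure_ascii=False))  # {"name": "白菜", "price_ave": "0.85"}

Note that the FEED_EXPORTERS entry only takes effect when the feed format is json (for example, scrapy crawl spider -o price_data.json); the CSV export used later in this article does not go through it.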
The spider logic (spider.py) is as follows:
# _*_ coding:utf-8 _*_
# Author: 未央
# Created: 2020/4/16 14:55
# File: spider.py
# IDE: PyCharm
import scrapy
from price_spider.items import PriceSpiderItemLoader, PriceSpiderItem


class SpiderSpider(scrapy.Spider):
    name = 'spider'
    allowed_domains = ['www.wbncp.com']
    start_urls = ['http://www.wbncp.com/pricequery.aspx?pageno=1&itemname=%e7%99%bd%e8%8f%9c&datestart=2017/10/1'
                  '&dateend=2020/3/31',
                  'http://www.wbncp.com/pricequery.aspx?pageno=1&itemname=土豆&datestart=2017/10/1'
                  '&dateend=2020/3/31',
                  'http://www.wbncp.com/pricequery.aspx?pageno=1&itemname=芹菜&datestart=2017/10/1'
                  '&dateend=2020/3/31']

    def parse(self, response):
        # Each price record is a <tr> with class "center" or "center gray"
        item_nodes = response.xpath("//tr[@class='center' or @class='center gray']")
        for item_node in item_nodes:
            item_loader = PriceSpiderItemLoader(item=PriceSpiderItem(), selector=item_node)
            item_loader.add_css("name", "td:nth-child(1) ::text")       # product name
            item_loader.add_css("address", "td:nth-child(2) ::text")    # origin
            item_loader.add_css("norms", "td:nth-child(3) ::text")      # specification
            item_loader.add_css("unit", "td:nth-child(4) ::text")       # unit
            item_loader.add_css("high", "td:nth-child(5) ::text")       # highest price
            item_loader.add_css("low", "td:nth-child(6) ::text")        # lowest price
            item_loader.add_css("price_ave", "td:nth-child(7)::text")   # average price
            item_loader.add_css("price_date", "td:nth-child(8)::text")  # date
            price_item = item_loader.load_item()
            yield price_item

        # Follow the pagination link (the tenth <a> in the pager block)
        next_page = response.xpath("//*[@id='cphright_lblpage']/div/a[10]/@href").extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
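The pagination step depends on the "next page" link being the tenth <a> inside the pager element; if the pager renders a different number of links, the spider stops early or follows the wrong page. A hedged alternative is to select the link by its visible text instead of its position. It assumes the pager's next-page link is labelled 下一页, which I have not verified against the live site; the snippet is meant to replace the last three lines of parse():

# Sketch (inside parse()): follow the pager link by its visible text rather than its position.
# Assumes the next-page anchor text contains "下一页"; adjust it to whatever the pager actually shows.
next_page = response.xpath("//a[contains(text(), '下一页')]/@href").extract_first()
if next_page is not None:
    next_page = response.urljoin(next_page)
    yield scrapy.Request(next_page, callback=self.parse)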
The code for price_scrapy_main.py, which stands in for running the crawl command on the command line, is as follows:
# _*_ coding:utf-8 _*_
# Author: 未央
# Created: 2020/4/16 14:55
# File: price_scrapy_main.py
# IDE: PyCharm
from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl spider -o price_data.csv" on the command line
execute(["scrapy", "crawl", "spider", "-o", "price_data.csv"])
After running it, import the CSV data into Excel; the result looks like this:
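If you would rather not import the CSV by hand, a short sketch (assuming pandas and openpyxl are installed; price_data.csv is the feed exported by the run above, with columns named after the item fields) loads the export and writes an .xlsx directly:

# Sketch: turn the exported CSV into an Excel workbook without manual importing.
# Assumes pandas and openpyxl are available in the environment.
import pandas as pd

df = pd.read_csv("price_data.csv", encoding="utf-8")
print(df.head())   # columns follow the item fields: name, address, norms, unit, high, low, price_ave, price_date
df.to_excel("price_data.xlsx", index=False)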
3. Lessons learned
1. requests is certainly flexible, but once there is a lot of data to scrape it becomes inconvenient and the code gets long; Scrapy is the more comfortable choice, especially for crawling many pages, where its horizontal (pagination) and vertical (detail-page) crawling really shines.
2. With Scrapy, the work is mainly the various options in the settings file (settings.py) and the crawl logic in the spider file (spider.py in this article); the selectors are the fiddliest part (the interactive-shell tip below helps with that).
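One thing that takes much of the pain out of the selector work is Scrapy's interactive shell: running scrapy shell "<price query URL>" downloads the page and drops you into a Python session where response is already populated, so the spider's expressions can be tested line by line before they go into spider.py. For example:

# Inside `scrapy shell "<price query URL>"` the response object is pre-loaded,
# so the spider's selectors can be tried interactively:
rows = response.xpath("//tr[@class='center' or @class='center gray']")
print(len(rows))                                              # number of price rows matched
print(rows[0].css("td:nth-child(1) ::text").extract_first())  # product name of the first row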