Scrapy data cleaning
When scraping, some of the data is not what we need, or its format does not meet our requirements, so it has to be processed before being saved. The traditional approach is to define the fields we need in items.py, for example:
import scrapy

class ShetuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    img_name = scrapy.Field()
    img_url = scrapy.Field()
    img_time = scrapy.Field()
    img_looknum = scrapy.Field()
    img_collect = scrapy.Field()
    img_down = scrapy.Field()
Then, in the spider file:
def xiangxi_parse(self, response):
    item = ShetuItem()
    # image name
    img_name = response.xpath('//div[@class="photo-view"]/h1/text()')[0].extract()
    # image url
    img_url = response.xpath('//div[@class="huabu"]/a/img/@src')[0].extract()
    # publication time
    img_time = response.xpath('//div[@class="photo-view"]/div/span[@class="publicityt"]/text()')[0].extract()
    # view count
    img_looknum = response.xpath('//div[@class="photo-view"]/div/span[@class="look"]/read/text()')[0].extract()
    # favorite count (strip the "收藏" label text)
    img_collect = response.xpath('//div[@class="photo-view"]/div/span[@class="collect"]/text()')[0].extract().strip("收藏")
    # download count (strip whitespace and the "下载" label text)
    img_down = response.xpath('//div[@class="photo-view"]/div/span[@class="download"]/text()')[1].extract().strip("\n\t下载")
    for field in item.fields.keys():  # iterate over all field names
        item[field] = eval(field)     # look up the local variable with the same name
    yield item
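The eval(field) loop above simply assigns each local variable to the item field with the same name. It works, but eval is fragile; a minimal, safer sketch of the same idea (assuming the same local variable names as above) uses an explicit mapping:

# Safer equivalent of the eval() loop: build the item from an explicit
# dict of field names to the already-extracted local values.
values = {
    'img_name': img_name,
    'img_url': img_url,
    'img_time': img_time,
    'img_looknum': img_looknum,
    'img_collect': img_collect,
    'img_down': img_down,
}
for field, value in values.items():
    item[field] = value
yield item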
Suppose we need to process name, url, and so on. Doing that in the spider would require many conditionals, making the code redundant and hard to read. To keep the code concise, we can instead use the following approach and redefine items.py:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
# In Scrapy 2.0+ these processors live in itemloaders.processors instead.
from scrapy.loader.processors import MapCompose, TakeFirst

def str_convert(value):
    return 'country_' + value

def get_nums(value):
    return value.replace(',', '')

class CountryItemLoader(ItemLoader):
    # Define a default output processor for every field.
    # TakeFirst() returns the first element of the extracted list,
    # equivalent to extract_first().
    default_output_processor = TakeFirst()

class CountryproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field(
        # Define an input processor for this field; it maps the
        # cleaning function over every extracted value.
        input_processor=MapCompose(str_convert),
    )
    population = Field(
        input_processor=MapCompose(get_nums),
    )
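A quick way to see what these processors do (a standalone sketch, runnable in a Python shell with Scrapy installed and the str_convert and get_nums functions defined above): MapCompose applies each function in turn to every value in the extracted list, and TakeFirst then picks the first non-empty result:

from scrapy.loader.processors import MapCompose, TakeFirst

# MapCompose(f, g) applies f, then g, to each value in the list.
proc = MapCompose(get_nums, str_convert)
print(proc(['1,382,323,332']))              # ['country_1382323332']
# TakeFirst() returns the first non-empty value, like extract_first().
print(TakeFirst()(['country_1382323332']))  # 'country_1382323332'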
Then, in the spider file:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from countrypro.items import CountryproItem, CountryItemLoader

class CountrySpider(scrapy.Spider):
    name = 'country'
    allowed_domains = ['example.webscraping.com']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.urls = ['http://example.webscraping.com/places/default/view/China-47']

    def start_requests(self):
        for url_str in self.urls:
            yield Request(url_str, callback=self.parse, dont_filter=True)

    def parse(self, response):
        loader = CountryItemLoader(item=CountryproItem(), response=response)
        loader.add_css('title', 'tr#places_country__row td.w2p_fw::text')
        # loader.add_xpath(...) works the same way
        loader.add_css('population', 'tr#places_population__row td.w2p_fw::text')
        return loader.load_item()
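As a quick sanity check (a sketch, not part of the original project), the loader can be exercised offline against a hand-built HtmlResponse; the HTML snippet below is hypothetical and only assumes the table structure implied by the CSS selectors:

from scrapy.http import HtmlResponse
from countrypro.items import CountryproItem, CountryItemLoader

# Hypothetical HTML mirroring the structure the selectors expect.
html = (b'<table>'
        b'<tr id="places_country__row"><td class="w2p_fw">China</td></tr>'
        b'<tr id="places_population__row"><td class="w2p_fw">1,382,323,332</td></tr>'
        b'</table>')
response = HtmlResponse(url='http://example.com', body=html)

loader = CountryItemLoader(item=CountryproItem(), response=response)
loader.add_css('title', 'tr#places_country__row td.w2p_fw::text')
loader.add_css('population', 'tr#places_population__row td.w2p_fw::text')
print(loader.load_item())
# Expected: {'population': '1382323332', 'title': 'country_China'}

The input processors run as values are added (str_convert prefixes the title, get_nums strips the commas), and the default TakeFirst() output processor collapses each list to a single value when load_item() is called.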