# Defines the item fields and the processors that clean and normalize the scraped raw data.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import scrapy
import hashlib
import re
from scrapy.loader.processors import (MapCompose, TakeFirst, Join)
from scrapy.loader import ItemLoader
def go_md5(value):
    """Return the hex MD5 digest of *value* (used as the table's primary key).

    The digest is computed over the UTF-8 encoding of the string form of
    *value*.  The original implementation silently hashed nothing for any
    non-string input, so every non-string value produced the SAME digest —
    a guaranteed collision for a primary-key column.  Coercing with str()
    keeps string inputs byte-identical while giving distinct digests for
    other types.
    """
    if not isinstance(value, str):
        # Coerce so distinct non-string inputs yield distinct digests.
        value = str(value)
    return hashlib.md5(value.encode('utf-8')).hexdigest()
def go_time(value):
    """Normalize a scraped publish-time string to a MySQL DATE-style string.

    Strips surrounding whitespace, removes the site's '·' separator, and
    converts 'YYYY/MM/DD' to 'YYYY-MM-DD'.  Blank input comes back as the
    empty string.
    """
    # strip -> drop '·' -> '/' to '-' -> strip again.  Equivalent to the
    # original nested branches ('-'.join(split('/')) == replace('/', '-'))
    # but without the redundant truthiness checks.
    return value.strip().replace('·', '').replace('/', '-').strip()
def go_cont(value):
    """Trim leading/trailing whitespace (including newlines) from article text."""
    cleaned = value.strip()
    return cleaned
def go_img(value):
    """Identity pass-through for the image-URL field.

    Used as an output processor so the image downloader keeps receiving the
    list form it expects, instead of the TakeFirst scalar that the loader's
    default output processor would produce.
    """
    return value
def get_num(value):
    """Extract the first run of digits from *value* (comment/like/favorite counts).

    Returns the number as an int, or 0 when the string contains no digits.
    """
    # re.search finds the first digit run anywhere in the string; the
    # original's re.match with lazy '.*?' padding on both sides was an
    # indirect spelling of exactly this.
    found = re.search(r'\d+', value)
    return int(found.group()) if found else 0
class ArticleItemLoader(ItemLoader):
    """Custom ItemLoader that returns the first value of each field's
    collected list instead of the whole list."""
    # Applies to every field that does not define its own output_processor.
    default_output_processor = TakeFirst()
class JobboleItem(scrapy.Item):
    """
    input_processor  -- pre-processes each value extracted for the field
    output_processor -- shapes the collected values returned into the item
    """
    # MD5 digest (from go_md5), used as the table's primary key.
    cont_id = scrapy.Field(
        input_processor=MapCompose(go_md5)
    )
    cont_url = scrapy.Field()  # source URL of the article
    title = scrapy.Field()
    # Normalized to 'YYYY-MM-DD' by go_time for the MySQL DATE column.
    publish_time = scrapy.Field(
        input_processor=MapCompose(go_time)
    )
    # Stripped fragments joined into a single string.
    cont = scrapy.Field(
        input_processor=MapCompose(go_cont),
        output_processor=Join('')
    )
    # go_img as the OUTPUT processor keeps the value a list, which the
    # image download pipeline expects (overrides TakeFirst).
    img_url = scrapy.Field(
        output_processor=MapCompose(go_img)
    )
    # Counts parsed to int by get_num (0 when no digits present).
    link_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    collection_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    comment_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    img_path = scrapy.Field()  # local path filled in by the image pipeline
# Quick manual smoke test for get_num.
if __name__ == '__main__':
    print(get_num(' s ss 14 ssss'))