Scrapy crawler
spider
import json

import scrapy

from ..items import AidouItem


class AiDouSpider(scrapy.Spider):
    name = "aidou"

    def start_requests(self):
        # Entry point: fetch the list of channels (circle categories).
        channel_url = "http://static.idol001.com/userquanzi/type_list/main.json"
        yield scrapy.Request(channel_url, callback=self.channel_parsing)

    def channel_parsing(self, rep):
        # For every channel, request the list of circles (quanzi) it contains.
        channel_list = json.loads(rep.text).get("list")
        for channel in channel_list:
            quanzi_url = "http://static.idol001.com/userquanzi/type_list/%s.json" % channel.get("typeid")
            yield scrapy.Request(quanzi_url, callback=self.quanzi_parsing)

    def quanzi_parsing(self, rep):
        # For every circle, page through its message list (pages 0-9999).
        quanzi_list = json.loads(rep.text).get("list")
        for quanzi in quanzi_list:
            for page in range(10000):
                article_url = ("http://data.idol001.com/api_moblie_idol_userquanzi.php"
                               "?action=get_message_list&order=recom_time&app_platform=android"
                               "&version=250&qzid=%s&count=10&page=%s") % (quanzi.get("_id"), page)
                yield scrapy.Request(article_url, callback=self.article_parsing,
                                     cb_kwargs={'quanzi_id': quanzi.get("_id")})

    def article_parsing(self, rep, quanzi_id):
        # Empty pages are skipped; for every message, fetch its comment list.
        article_list = json.loads(rep.text).get("list")
        if article_list:
            for article in article_list:
                article_details_url = ("https://data.idol001.com/api_moblie_idol_userquanzi.php"
                                       "?action=get_message_comment_list&messageid=%s&qzid=%s"
                                       "&order=time&page=1&version=250&app_platform=android"
                                       ) % (article.get("_id"), quanzi_id)
                yield scrapy.Request(article_details_url, callback=self.article_details_parsing)

    def article_details_parsing(self, rep):
        # Every comment carries a userid; request that user's profile.
        article_details_list = json.loads(rep.text).get("list")
        for article_details in article_details_list:
            user_info_url = ("http://data.idol001.com/api_moblie_idol.php"
                             "?action=get_userinfo_detail&userid=%s") % article_details.get("userid")
            yield scrapy.Request(user_info_url, callback=self.user_info_parsing)

    def user_info_parsing(self, rep):
        # rep.text is already decoded by Scrapy, so the JSON body is stored as-is.
        item = AidouItem()
        item["userinfo"] = json.loads(rep.text)
        yield item
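The spider imports AidouItem from the project's items.py, which the post doesn't show. A minimal sketch that matches the single userinfo field assigned in user_info_parsing:

import scrapy


class AidouItem(scrapy.Item):
    # holds the full JSON body of one user-detail response
    userinfo = scrapy.Field()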
pipeline
import csv
import os


class AidouPipeline:
    def __init__(self):
        # Write the CSV next to the spider code.
        store_file = os.path.dirname(__file__) + '/spiders/aidou_user_info.csv'
        # newline='' avoids blank rows on Windows; utf-8 keeps Chinese text intact.
        self.file = open(store_file, 'w', newline='', encoding='utf-8')
        self.fieldnames = ['_id', 'nickname', 'sex', 'location_str', 'favorite_star', 'score_left', 'care_num',
                           'location_first', 'abnormal',
                           'level_img_v6', 'admin_title', 'bi_follow', 'live_all_num', 'score',
                           'level_img', 'weibo_url', 'sina_uid',
                           'fans_num', 'register_time', 'last_login_time', 'publish_num', 'user_type', 'sys_time',
                           'next_level_score', 'birthday', 'birthday_str', 'sign', 'admin_score',
                           'zone', 'location_second', 'level', 'admin_title_cn', 'image', 'idol_num', 'vip_expire_time',
                           'background_img', 'vip_expire_time_str', 'is_vip']
        # extrasaction='ignore' drops any response keys not listed above
        # instead of raising ValueError.
        self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, extrasaction='ignore')
        self.writer.writeheader()

    def process_item(self, item, spider):
        self.writer.writerow(item["userinfo"])
        return item

    def close_spider(self, spider):
        self.file.close()
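Scrapy only runs the pipeline once it is registered in settings.py. A sketch, assuming the project package is named aidou (the actual package name isn't shown in the post):

# settings.py
ITEM_PIPELINES = {
    "aidou.pipelines.AidouPipeline": 300,  # lower numbers run earlier in the chain
}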
start
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# the spider to run, identified by its name attribute (AiDouSpider.name)
spider_name = "aidou"
process = CrawlerProcess(get_project_settings())
process.crawl(spider_name)
process.start()
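The original template also reserved a parms placeholder. If the spider did take arguments, they should be passed as keyword arguments to crawl(), which forwards them to the spider's __init__ (passing them positionally would override the name argument). A sketch, where keywords is a hypothetical argument that AiDouSpider does not actually read:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# keyword arguments become spider attributes, e.g. self.keywords
# ("keywords" is hypothetical; AiDouSpider defines no such parameter)
process.crawl("aidou", keywords="some-value")
process.start()

From inside the project directory, the same spider can also be started with the scrapy crawl aidou command.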