Scrapy crawler

spider
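The spider walks the idol001 group ("quanzi") API in five hops: the channel index (main.json), each channel's group list, each group's paginated message list, each message's comment list, and finally the profile of every user found there, which is yielded as an item.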

import json

import scrapy

from ..items import AidouItem


class AiDouSpider(scrapy.Spider):
    name = "aidou"

    def start_requests(self):
        # Fetch the master list of channels first.
        channel_url = "http://static.idol001.com/userquanzi/type_list/main.json"
        yield scrapy.Request(channel_url, callback=self.channel_parsing)

    def channel_parsing(self, rep):
        # Each channel has its own JSON file listing the groups ("quanzi") under it.
        channel_list = json.loads(rep.text).get("list")
        for channel in channel_list:
            quanzi_url = "http://static.idol001.com/userquanzi/type_list/%s.json" % channel.get("typeid")
            yield scrapy.Request(quanzi_url, callback=self.quanzi_parsing)

    def quanzi_parsing(self, rep):
        # For every group, page through its recommended-message list; the hard
        # page cap is a crude upper bound, and article_parsing simply skips
        # pages that come back empty.
        quanzi_list = json.loads(rep.text).get("list")
        for quanzi in quanzi_list:
            for page in range(10000):
                article_url = (
                    "http://data.idol001.com/api_moblie_idol_userquanzi.php"
                    "?action=get_message_list&order=recom_time&app_platform=android"
                    "&version=250&qzid=%s&count=10&page=%s"
                ) % (quanzi.get("_id"), page)
                yield scrapy.Request(article_url, callback=self.article_parsing,
                                     cb_kwargs={'quanzi_id': quanzi.get("_id")})

    def article_parsing(self, rep, quanzi_id):
        article_list = json.loads(rep.text).get("list")
        if article_list:
            for article in article_list:
                article_details_url = (
                    "https://data.idol001.com/api_moblie_idol_userquanzi.php"
                    "?action=get_message_comment_list&messageid=%s&qzid=%s"
                    "&order=time&page=1&version=250&app_platform=android"
                ) % (article.get("_id"), quanzi_id)
                # Yield inside the loop so every article's comment list is
                # requested, not just the last one's.
                yield scrapy.Request(article_details_url, callback=self.article_details_parsing)

    def article_details_parsing(self, rep):
        # Each comment record carries the commenter's user id; fetch that profile.
        article_details_list = json.loads(rep.text).get("list")
        for article_details in article_details_list:
            user_info_url = ("http://data.idol001.com/api_moblie_idol.php"
                             "?action=get_userinfo_detail&userid=%s") % article_details.get("userid")
            yield scrapy.Request(user_info_url, callback=self.user_info_parsing)

    def user_info_parsing(self, rep):
        # rep.text is already decoded by Scrapy, so the JSON can be parsed
        # directly; the pipeline flattens the dict into one CSV row.
        item = AidouItem()
        item["userinfo"] = json.loads(rep.text)
        yield item
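
The spider imports AidouItem from the project's items.py, which the post does not show. A minimal sketch of what it presumably looks like, assuming userinfo is its only field (that is the only key the spider and the pipeline touch):

import scrapy


class AidouItem(scrapy.Item):
    # Raw user-profile JSON as returned by get_userinfo_detail.
    userinfo = scrapy.Field()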

pipeline:
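The pipeline flattens each scraped user-info dict into one row of a CSV file stored next to the spider module: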

import csv
import os

from itemadapter import ItemAdapter


class AidouPipeline:
    def __init__(self):
        store_file = os.path.join(os.path.dirname(__file__), 'spiders', 'aidou_user_info.csv')
        # utf-8 keeps non-ASCII nicknames intact; newline='' is required by csv.
        self.file = open(store_file, 'w', newline='', encoding='utf-8')
        self.fieldnames = ['_id', 'nickname', 'sex', 'location_str', 'favorite_star', 'score_left', 'care_num',
                           'location_first', 'abnormal',
                           'level_img_v6', 'admin_title', 'bi_follow', 'live_all_num', 'score',
                           'level_img', 'weibo_url', 'sina_uid',
                           'fans_num', 'register_time', 'last_login_time', 'publish_num', 'user_type', 'sys_time',
                           'next_level_score', 'birthday', 'birthday_str', 'sign', 'admin_score',
                           'zone', 'location_second', 'level', 'admin_title_cn', 'image', 'idol_num', 'vip_expire_time',
                           'background_img', 'vip_expire_time_str', 'is_vip']
        # extrasaction='ignore' drops any keys the API returns beyond the
        # columns above instead of raising ValueError.
        self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, extrasaction='ignore')
        self.writer.writeheader()

    def process_item(self, item, spider):
        # One CSV row per scraped user profile.
        self.writer.writerow(item["userinfo"])
        return item

    def close_spider(self, spider):
        self.file.close()
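
For this pipeline to run, it has to be enabled in the project's settings.py. A typical entry, assuming the project package is named aidou (the module path here is a guess, not shown in the post):

ITEM_PIPELINES = {
    # Hypothetical module path; adjust to the actual project package name.
    "aidou.pipelines.AidouPipeline": 300,
}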

start
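A small launcher script, so the spider can be started from an IDE or a plain python command instead of the scrapy CLI: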

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Spider arguments become attributes on the spider instance when passed to
# crawl() as keyword arguments; the aidou spider above does not use any,
# and the keyword name below is arbitrary.
params = "keywords"
# The spider to run, referenced by its `name` attribute.
spider_name = "aidou"

process = CrawlerProcess(get_project_settings())
process.crawl(spider_name, keywords=params)
process.start()
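
Alternatively, the same spider can be started from the project root with Scrapy's own command line: scrapy crawl aidou.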