利用Scrapy框架爬取博客园精选区前n页的博客题目、点赞数、链接
程序员文章站
2022-05-07 17:46:05
...
将数据写入数据库或生成*.xml文件
代码
#Scrapy框架模板
# import pymongo
import scrapy
# client = pymongo.MongoClient()
# mydb = client['mydb']
# cnblogs = mydb['cnblogs']
class CnBlogSpider(scrapy.Spider):
    """Crawl the cnblogs "pick" (featured) section.

    Yields one dict per post containing its title (标题), link (链接) and
    digg/like count (点赞数), then follows the pager's "Next" link until
    ``last_page_path`` is reached or no "Next" link exists.
    """
    name = 'CnBlogSpider'
    start_urls = ['https://www.cnblogs.com/pick']
    # Crawling stops when the pager would navigate to this path.
    # Original behavior: stop before page 11. Set to None to crawl every page.
    last_page_path = "/pick/11/"

    def parse(self, response):
        """Parse one listing page: yield post items, then follow "Next".

        :param response: scrapy Response for a /pick listing page.
        """
        for blog in response.xpath('//div[@class="post_item"]'):
            yield {
                '标题': blog.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first(),
                '链接': blog.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first(),
                '点赞数': blog.xpath('div[@class="digg"]/div[@class="diggit"]/span/text()').extract_first()
            }
            # cnblogs.insert_one(infos)  # uncomment together with the pymongo setup above to persist to MongoDB

        # The original located "Next" by positional index (a[13]/a[14]/a[15]),
        # which shifts as the pager grows (its own comment admitted this).
        # Matching the anchor text is stable across all pages and also yields
        # None on the final page, where no "Next" anchor exists at all.
        next_page = response.xpath(
            '//*[@id="paging_block"]/div/a[contains(text(), "Next")]/@href'
        ).extract_first()
        if next_page is None or next_page == self.last_page_path:
            return  # last page reached, or crawl limit hit
        # urljoin turns the relative pager href into an absolute URL.
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
运行
代码完成后,进入程序所在文件,Shift +右键,选择在此打开命令窗口,
若要写入数据库,在命令台输入:scrapy runspider cnblogs.py
若要生成指定文件,在命令台输入:scrapy runspider cnblogs.py -o cnblogs.xml 或 scrapy runspider cnblogs.py -o cnblogs.csv