Simple Scrapy web crawler example code
# -*- coding: utf-8 -*-
import scrapy

# The project and spider were generated with:
#   scrapy startproject mySpider
#   scrapy genspider itcast "itcast.cn"


class ITcastItem(scrapy.Item):
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()


class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ["http://www.itcast.cn/channel/teacher.shtml"]

    # Run with:
    #   scrapy crawl itcast
    # Export the scraped items with:
    #   scrapy crawl itcast -o teachers.csv
    #   scrapy crawl itcast -o teachers.json
    #   scrapy crawl itcast -o teachers.jsonl
    #   scrapy crawl itcast -o teachers.xml
    def parse(self, response):
        # Save the raw page for inspection
        filename = "teacher.html"
        with open(filename, 'wb') as f:
            f.write(response.body)

        # Print the page title
        title = response.xpath('/html/head/title/text()').extract_first()
        print(title)

        # Collection holding the teacher records
        items = []
        for each in response.xpath("//div[@class='li_txt']"):
            # Wrap the scraped data in an ItcastItem object
            item = ITcastItem()
            # extract() always returns a list of unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()
            # Each XPath result here is a one-element list
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            items.append(item)
        # Return the collected items directly
        return items
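
The spider above is normally launched from inside the project directory with the scrapy crawl commands shown in the comments. As a minimal sketch (assuming Scrapy 2.1+ for the FEEDS setting; the output file name teachers.json is just an illustration), the same spider can also be run as a standalone script through CrawlerProcess, which is handy for quick testing without a full project:

# run_itcast.py -- standalone runner sketch, assumes ItcastSpider is importable
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # Equivalent to "scrapy crawl itcast -o teachers.json"
    "FEEDS": {"teachers.json": {"format": "json"}},
})
process.crawl(ItcastSpider)   # schedule the spider
process.start()               # blocks until the crawl finishes

Running python run_itcast.py then writes the exported items to teachers.json, the same result the -o command-line flag produces.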