scrapy爬取公交站
程序员文章站
2022-04-26 12:06:30
...
scrapy爬取公交站
1、settings
# Register the project's item pipeline. The value (300) is the pipeline's
# priority/order in the 0-1000 range: lower numbers run earlier when
# several pipelines are enabled.
ITEM_PIPELINES = {
    'test2.pipelines.Test2Pipeline': 300,
}
2、items
class Test2Item(scrapy.Item):
    """Container for one bus route scraped from <city>.8684.cn."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    bus_name = scrapy.Field()    # route name (detail-page <h1> text)
    bus_type = scrapy.Field()    # route category/type (link inside the <h1>)
    bus_time = scrapy.Field()    # operating hours ("运行时间" list entry)
    bus_cost = scrapy.Field()    # fare information ("票价信息" list entry)
    bus_update = scrapy.Field()  # last-updated date ("最后更新" list entry)
3、spiders
from test2.items import Test2Item
import scrapy
import logging
from lxml import etree
import requests
import time
class Test2ItemSpider(scrapy.Spider):
    """Crawl bus-route pages on <city>.8684.cn and collect one Test2Item
    per route (name, type, operating hours, fare, last-update date)."""

    name = "test2Item"
    # NOTE(review): input() runs at class-definition (import) time, so any
    # tool that merely imports this module blocks on the prompt; a spider
    # argument (`scrapy crawl test2Item -a city=...`) would be cleaner.
    # Kept as-is to preserve the original interface.
    city_name = input("请输入要爬取公交站地址的城市名称:--")
    allowed_domains = ['{}.8684.cn'.format(city_name)]
    start_urls = {
        "https://{}.8684.cn/".format(city_name),
    }

    @staticmethod
    def _remove_prefix(text, prefix):
        """Return *text* with *prefix* removed when it actually starts with it.

        Fixes the original's str.lstrip(prefix) misuse: lstrip treats its
        argument as a character *set* and keeps stripping any of those
        characters, which can silently eat leading characters of the real
        value (e.g. a value whose first character happens to appear in the
        prefix).
        """
        return text[len(prefix):] if text.startswith(prefix) else text

    def parse(self, response):
        """Follow each route-list link ("/list1", ...) from the city front
        page, then scrape every route detail page it references.

        Returns a list of populated Test2Item objects (Scrapy accepts an
        iterable of items from parse()).

        NOTE(review): detail pages are fetched with blocking requests.get()
        calls instead of scrapy.Request, bypassing the scheduler, throttling
        and allowed_domains checks; kept to preserve the original behavior.
        """
        items = []
        # Links such as "/list1" that group routes by leading digit/letter.
        list_links = response.xpath(
            "//div[@class='bus-layer depth w120']"
            "/div[@class='pl10'][1]/div[@class='list']/a/@href"
        ).extract()
        # e.g. "https://lanzhou.8684.cn/" + "list1"  -> list-page URLs
        list_urls = [response.url + link.strip("/") for link in list_links]
        print('*' * 10)
        print(response.url)
        print('*' * 10)
        for list_url in list_urls:
            tree = etree.HTML(requests.get(url=list_url).text)
            detail_hrefs = tree.xpath("//div[@class='list clearfix']/a/@href")
            for detail in detail_hrefs:
                item = Test2Item()
                detail_url = response.url + detail.strip("/")
                tree = etree.HTML(requests.get(url=detail_url).text)
                time.sleep(2)  # crude politeness delay between page fetches
                # Common XPath prefix of the route-info panel, hoisted once.
                info = "//div[@class='layout-left']/div[@class='bus-lzinfo mb20']"
                # Route name (page <h1>).
                item["bus_name"] = tree.xpath(info + "//h1/text()")[0]
                # Route type/category (link inside the <h1>).
                item["bus_type"] = tree.xpath(info + "//h1/a/text()")[0]
                # Operating hours, with the "运行时间:" label stripped.
                item["bus_time"] = self._remove_prefix(
                    tree.xpath(info + "//ul/li[1]/text()")[0], "运行时间:")
                # Fare information, with the "票价信息:" label stripped.
                item["bus_cost"] = self._remove_prefix(
                    tree.xpath(info + "//ul/li[2]/text()")[0], "票价信息:")
                # Last-update date, with the "最后更新:" label stripped.
                item["bus_update"] = self._remove_prefix(
                    tree.xpath(info + "//ul/li[4]/text()")[0], "最后更新:")
                items.append(item)
        return items
4、pipelines
import json
class Test2Pipeline(object):
    """Persist items produced by the ``test2Item`` spider to ``json1.txt``
    as pretty-printed, comma-separated JSON objects."""

    def open_spider(self, spider):
        """Runs once at spider start-up: open the output file for appending."""
        if spider.name != 'test2Item':
            return
        self.f = open('json1.txt', 'a', encoding='utf-8')

    def close_spider(self, spider):
        """Runs once at spider shutdown: release the file handle."""
        if spider.name != 'test2Item':
            return
        self.f.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and pass it along.

        ensure_ascii=False keeps the Chinese text readable in the file
        instead of \\uXXXX escapes.
        """
        if spider.name == 'test2Item':
            record = json.dumps(dict(item), ensure_ascii=False, indent=2)
            self.f.write(record + ',\n')
        # Returning the item lets lower-priority pipelines receive it too.
        return item
5、结果