异步获取 risfond 案例数据（同步与异步爬虫对比）
程序员文章站
2022-05-13 20:59:25

1. 同步版本
# @Time : 2020/1/13 9:45
# @Author : GKL
# FileName : spider.py
# Software : PyCharm
import requests
from lxml import etree
import pandas
import time
class Spider(object):
    """Synchronous scraper for risfond.com case listings.

    Walks listing pages, follows each case detail link, extracts nine
    detail fields plus the listing date, and appends the rows to data.csv.
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self.base_url = 'http://www.risfond.com'

    def get_link(self, url):
        """Return (detail-link hrefs, listing dates) scraped from one listing page.

        :param url: listing-page URL ('.../case/all-N')
        :return: tuple of two parallel lists (relative hrefs, date strings)
        """
        # timeout so a dead connection cannot hang the whole crawl
        response = requests.get(url, headers=self.headers, timeout=30).content.decode('utf-8')
        page = etree.HTML(response)
        link_list = page.xpath('//ul[@class="it-list"]/li//a/@href')
        date_list = page.xpath('//ul[@class="it-list"]/li/span[@class="thetime"]/text()')
        return link_list, date_list

    def get_data(self, link_list, date_list):
        """Fetch each detail page and return a list of 10-field rows.

        :param link_list: relative detail-page hrefs from get_link()
        :param date_list: matching listing-date strings from get_link()
        :return: list of rows [9 detail fields..., listing date]
        """
        items = []
        for href, date in zip(link_list, date_list):
            link = self.base_url + href
            response = requests.get(link, headers=self.headers, timeout=30).content.decode('utf-8')
            page = etree.HTML(response)
            data = page.xpath('//div[@class="sc_d_l cf"]//span[@class="sc_d_con"]/text()')
            if len(data) < 9:
                # layout change or partial page: skip instead of IndexError
                continue
            # fields in page order: 职位名称, 年薪, 企业名称, 工作地点,
            # 案例日期, 所在行业, 职位周期, 上岗人数, 顾问团队
            item = list(data[:9]) + [date]
            print(item)
            items.append(item)
        return items

    def save_csv(self, items):
        """Append rows to data.csv (mode='a': append, no header/index)."""
        pandas.DataFrame(data=items).to_csv('data.csv', index=None, header=None, mode='a')

    def __call__(self, *args, **kwargs):
        # instances are callable: crawl listing pages 1..10
        for i in range(1, 11):
            # build from self.base_url instead of repeating the literal
            url = '{}/case/all-{}'.format(self.base_url, i)
            print('正在爬取>>>: ', url)
            link_list, date_list = self.get_link(url)
            items = self.get_data(link_list, date_list)
            self.save_csv(items)
if __name__ == '__main__':
    # time the full synchronous crawl of 10 listing pages
    start = time.time()
    spider = Spider()
    spider()
    print(time.time() - start)
    # measured: 10 listing-page requests, ~82.57 s synchronously
2. 异步
import aiohttp
import asyncio
import re
import time
from lxml import etree
import pandas
# Browser-style User-Agent header dict (same UA string as the sync version).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Site root, prepended to the relative detail links found on listing pages.
base_url = 'http://www.risfond.com'
async def download(url):
    """Asynchronously crawl one listing page plus all of its detail pages.

    Extracts nine detail fields and the listing date for every case linked
    on *url*, then appends the collected rows to data.csv.

    :param url: listing-page URL ('.../case/all-N')
    :return: None (rows are written to data.csv as a side effect)
    """
    items = []
    # one ClientSession per task; reused for the listing and all detail pages
    async with aiohttp.ClientSession() as session:
        # fix: pass the module-level headers — they were defined but never
        # attached, so the async version sent no User-Agent (sync version does)
        async with session.get(url, headers=headers) as response:
            # decode explicitly; without it the coroutine blocks on charset
            # detection and the async speed-up is lost
            html = await response.text(encoding='utf-8')
        page = etree.HTML(html)
        link_list = page.xpath('//ul[@class="it-list"]/li//a/@href')
        date_list = page.xpath('//ul[@class="it-list"]/li/span[@class="thetime"]/text()')
        for href, date in zip(link_list, date_list):
            # use the shared base_url constant instead of repeating the literal
            link = base_url + href
            async with session.get(link, headers=headers) as response:
                html2 = await response.text(encoding='utf-8')
            detail = etree.HTML(html2)
            data = detail.xpath('//div[@class="sc_d_l cf"]//span[@class="sc_d_con"]/text()')
            if len(data) < 9:
                # layout change or partial page: skip instead of IndexError
                continue
            # fields in page order: 职位名称, 年薪, 企业名称, 工作地点,
            # 案例日期, 所在行业, 职位周期, 上岗人数, 顾问团队
            item = list(data[:9]) + [date]
            print(item)
            items.append(item)
    # append without header/index so concurrent tasks can share one CSV
    pandas.DataFrame(data=items).to_csv('data.csv', index=None, header=None, mode='a')
if __name__ == '__main__':
    t1 = time.time()
    # listing pages 1..300
    page_urls = ['http://www.risfond.com/case/all-{}'.format(n) for n in range(1, 301)]
    # schedule one download task per listing page and run them concurrently
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        asyncio.gather(*[asyncio.ensure_future(download(u)) for u in page_urls])
    )
    print(time.time() - t1)
    # reference timing from the article: 10 listing pages took ~4.60 s
    # asynchronously (~7000 rows/min); this run issues 300 page requests
上一篇: props属性验证
下一篇: 天气谚语大全小学一年级