
Fetching risfond asynchronously



1. Synchronous

# @Time : 2020/1/13 9:45
# @Author : GKL
# FileName : spider.py
# Software : PyCharm

import requests
from lxml import etree
import pandas
import time



class Spider(object):

    def __init__(self):
        # self.url = 'http://www.risfond.com/case/all-1'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self.base_url = 'http://www.risfond.com'

    def get_link(self, url):
        response = requests.get(url, headers=self.headers).content.decode('utf-8')

        page = etree.HTML(response)

        link_list = page.xpath('//ul[@class="it-list"]/li//a/@href')
        date_list = page.xpath('//ul[@class="it-list"]/li/span[@class="thetime"]/text()')

        return link_list, date_list

    def get_data(self, link_list, date_list):

        items = []

        for url, date in zip(link_list, date_list):
            link = self.base_url + url
            response = requests.get(link, headers=self.headers).content.decode('utf-8')

            page = etree.HTML(response)

            data = page.xpath('//div[@class="sc_d_l cf"]//span[@class="sc_d_con"]/text()')

            # Position name
            position_name = data[0]

            # Annual salary
            salary = data[1]

            # Company name
            company_name = data[2]

            # Work location
            work_position = data[3]

            # Case date
            case_date = data[4]

            # Industry
            category = data[5]

            # Placement period
            period = data[6]

            # Number of hires
            people_num = data[7]

            # Consultant team
            team = data[8]

            item = [position_name, salary, company_name, work_position, case_date, category, period, people_num, team, date]
            print(item)
            items.append(item)

        return items

    def save_csv(self, items):
        df = pandas.DataFrame(data=items)
        # Save to CSV; mode='a' appends to the existing file
        df.to_csv('data.csv', index=False, header=False, mode='a')

    def __call__(self, *args, **kwargs):
        # __call__ makes the class instance itself callable
        for i in range(1, 11):
            url = 'http://www.risfond.com/case/all-{}'.format(i)
            print('Crawling >>>: ', url)
            link_list, date_list = self.get_link(url)
            items = self.get_data(link_list, date_list)
            self.save_csv(items)


if __name__ == '__main__':
    t0 = time.time()
    Spider()()
    print(time.time()-t0)
    
# 10 requests
# 82.57172274589539 seconds
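
That is roughly 8 seconds per page: the process sits idle while each request makes its round trip, one after another. Before reaching for asyncio, the same blocking code can be parallelized with a thread pool. A minimal sketch of that idea, reusing the Spider class above (crawl_page is a hypothetical helper, and max_workers=10 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

def crawl_page(spider, page):
    # Fetch one listing page and scrape every case it links to
    url = 'http://www.risfond.com/case/all-{}'.format(page)
    link_list, date_list = spider.get_link(url)
    return spider.get_data(link_list, date_list)

spider = Spider()
with ThreadPoolExecutor(max_workers=10) as pool:
    # pool.map runs crawl_page for each page number in worker threads
    for items in pool.map(lambda p: crawl_page(spider, p), range(1, 11)):
        spider.save_csv(items)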


2. Asynchronous

import aiohttp
import asyncio
import time
from lxml import etree
import pandas


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

base_url = 'http://www.risfond.com'

async def download(url):
    """
    异步爬虫方法
    :param url:
    :return:
    """
    items = []
    # Create a ClientSession, reusing the UA headers defined above
    async with aiohttp.ClientSession(headers=headers) as session:
        # session.get fetches the URL, analogous to requests.get
        async with session.get(url) as response:
            # Page source; pass the encoding explicitly, otherwise aiohttp falls
            # back to charset detection, a blocking call that hurts concurrency
            html = await response.text(encoding='utf-8')

            page = etree.HTML(html)

            link_list = page.xpath('//ul[@class="it-list"]/li//a/@href')
            date_list = page.xpath('//ul[@class="it-list"]/li/span[@class="thetime"]/text()')

            for href, date in zip(link_list, date_list):
                link = base_url + href

                async with session.get(link) as response:
                    html2 = await response.text(encoding='utf-8')

                    page = etree.HTML(html2)

                    data = page.xpath('//div[@class="sc_d_l cf"]//span[@class="sc_d_con"]/text()')

                    # Position name
                    position_name = data[0]

                    # Annual salary
                    salary = data[1]

                    # Company name
                    company_name = data[2]

                    # Work location
                    work_position = data[3]

                    # Case date
                    case_date = data[4]

                    # Industry
                    category = data[5]

                    # Placement period
                    period = data[6]

                    # Number of hires
                    people_num = data[7]

                    # Consultant team
                    team = data[8]

                    item = [position_name, salary, company_name, work_position, case_date, category, period, people_num,
                            team, date]

                    print(item)
                    items.append(item)

    df = pandas.DataFrame(data=items)
    # Save to CSV; mode='a' appends. to_csv runs synchronously inside one
    # coroutine, so appends from different tasks do not interleave.
    df.to_csv('data.csv', index=False, header=False, mode='a')


if __name__ == '__main__':

    t1 = time.time()
    # Build the listing-page URLs
    urls = ['http://www.risfond.com/case/all-{}'.format(i) for i in range(1, 301)]
    # Get the current event loop
    loop = asyncio.get_event_loop()

    # Wrap each coroutine in a Task
    tasks = [asyncio.ensure_future(download(url)) for url in urls]

    # asyncio.gather takes the tasks and schedules them concurrently
    tasks = asyncio.gather(*tasks)

    # Run the loop until every task completes
    loop.run_until_complete(tasks)

    print(time.time()-t1)

# 10 requests
# 4.602263450622559 seconds
# roughly 7,000 records per minute
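
A caveat with the figures above: the __main__ block as written queues 300 listing pages, each with its own ClientSession, and fires them all at once, which can overwhelm the target site. A common refinement is to share one session and cap concurrency with a semaphore. A minimal sketch of that idea (fetch_page is a hypothetical helper; the limit of 10 is an arbitrary choice):

import asyncio
import aiohttp

async def fetch_page(session, semaphore, url):
    # The semaphore allows at most 10 requests in flight at any moment
    async with semaphore:
        async with session.get(url) as response:
            return await response.text(encoding='utf-8')

async def main():
    semaphore = asyncio.Semaphore(10)
    urls = ['http://www.risfond.com/case/all-{}'.format(i) for i in range(1, 301)]
    # One shared session reuses connections across all requests
    async with aiohttp.ClientSession(headers=headers) as session:
        return await asyncio.gather(*[fetch_page(session, semaphore, u) for u in urls])

# On Python 3.7+: pages = asyncio.run(main())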

