欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

python网络爬虫:pyquery

程序员文章站 2022-03-02 22:48:01
...
  • pyquery:语法规则类似于jQuery,可以对HTML进行解析

pq = PyQuery(html文档)
pq('css选择器')
items():获取到多个标签时,使用items()将PyQuery对象转换为一个生成器,
然后使用 for in 循环逐个遍历
filter('css选择器'):过滤
text():获取标签的文本内容
attr('属性名'):获取属性值

from pyquery import PyQuery
import requests

def tencentjob(full_url):
    '''
    Crawl the job listing starting at ``full_url`` and follow the
    "next page" links until there are none left.

    Every page is downloaded with ``load_data`` and handed to
    ``parse_page_data``, which returns the href of the next page.
    Crawling stops when a page could not be downloaded, when no next
    link was found, or when the next link is ``'javascript:;'``.

    :param full_url: absolute URL of the first listing page to crawl
    :return: None
    '''
    # Loop instead of recursing once per page, so a long listing cannot
    # exhaust the interpreter's recursion limit.
    while full_url:
        html = load_data(full_url)
        if html is None:
            # load_data yields None for any non-200 response.
            break

        next_url = parse_page_data(html)
        # attr('href') gives None when the page has no '#next' anchor;
        # concatenating that onto the host prefix would raise TypeError.
        if not next_url or next_url == 'javascript:;':
            break

        full_url = 'https://hr.tencent.com/' + next_url


def load_data(url, timeout=10):
    '''
    Send a GET request with a browser-like User-Agent and return the
    page source.

    :param url: URL to fetch
    :param timeout: seconds passed to ``requests.get`` as its timeout;
        without one a stalled server would block the crawler forever
    :return: the response body as text when the status code is 200,
        otherwise None
    :raises requests.RequestException: propagated unchanged from
        ``requests.get`` (connection errors, timeouts, ...)
    '''
    req_header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }

    response = requests.get(url, headers=req_header, timeout=timeout)

    if response.status_code == 200:
        return response.text
    # Make the "request failed" result explicit for callers.
    return None

def parse_page_data(html):
    '''
    Parse one listing page, print every job found on it and return the
    link to the next page.

    For each ``tr.even`` / ``tr.odd`` row a dict with the keys
    ``title``, ``type``, ``number``, ``address``, ``time`` and
    ``content`` is built and printed. ``content`` comes from the job's
    detail page; it is an empty string when the row has no detail link
    or the detail page could not be downloaded.

    :param html: source of the listing page
    :return: the ``href`` of the ``a#next`` element, or None when the
        page has no such element
    '''
    # Build the PyQuery document for the listing page.
    html_pq = PyQuery(html)
    # Select the job rows: all <tr>, filtered down by class.
    tr_even = html_pq('tr').filter('.even')
    tr_odd = html_pq('tr').filter('.odd')

    # Diagnostic output: the matched rows and their types.
    print(tr_even,tr_odd)
    print(type(tr_odd),type(tr_even))

    # items() yields each row as its own PyQuery object.
    for tr in (tr_even + tr_odd).items():
        title_link = tr('td.l.square a')
        cells = tr('td')

        jobinfo = {}
        # Job title
        jobinfo['title'] = title_link.text()
        # eq(n) picks the cell at index n (indices start at 0).
        # Job category
        jobinfo['type'] = cells.eq(1).text()
        # Number of openings
        jobinfo['number'] = cells.eq(2).text()
        # Location
        jobinfo['address'] = cells.eq(3).text()
        # Publication date
        jobinfo['time'] = cells.eq(4).text()

        # Job description from the detail page. attr('href') is None when
        # the row has no title anchor, and load_data returns None on a
        # non-200 response; neither case may reach the string
        # concatenation or the detail parser.
        href = title_link.attr('href')
        detail_html = load_data('https://hr.tencent.com/' + href) if href else None
        jobinfo['content'] = parse_detail_data(detail_html) if detail_html else ''
        print(jobinfo)

    # Link to the next listing page.
    next_url = html_pq('a').filter('#next').attr('href')
    return next_url

def parse_detail_data(html):
    '''
    Extract the job description from a detail page.

    :param html: source of the job detail page
    :return: the text of every ``ul.squareli li`` element, joined with
        commas (an empty string when nothing matches)
    '''
    detail_doc = PyQuery(html)
    # items() yields each matched <li> as its own PyQuery object.
    entries = detail_doc('ul.squareli li').items()
    return ','.join(entry.text() for entry in entries)


if __name__ == '__main__':
    # Starting offset, sent as the ``start`` query parameter.
    offset = 0
    # URL of the first listing page.
    list_url_prefix = 'https://hr.tencent.com/position.php?&start='
    full_url = list_url_prefix + str(offset)
    tencentjob(full_url)