python网络爬虫:pyquery
程序员文章站
2022-03-02 22:48:01
...
- pyquery:语法规则类似于jQuery,可以对HTML进行解析
pq = PyQuery(html文档)
pq('css选择器')
items():获取到多个标签时,使用items()将pyquery转换为一个生成器
然后使用 for in 循环
filter('css选择器'):过滤
text():获取标签的文本内容
attr('属性名'):获取属性值
from pyquery import PyQuery
import requests
def tencentjob(full_url):
    """Crawl job-listing pages starting at ``full_url``, following next-page links.

    Each page is downloaded with ``load_data`` and handed to
    ``parse_page_data``, which returns the relative link to the next page.
    Pages are walked in a loop rather than by recursion so that a long
    pagination chain cannot exhaust the interpreter's recursion limit.

    :param full_url: absolute URL of the first listing page to crawl.
    :return: None
    """
    while full_url:
        html = load_data(full_url)
        if not html:
            # The page could not be fetched, so there is nothing to parse
            # and no next-page link to follow.
            break

        next_url = parse_page_data(html)
        # 'javascript:;' is the value this crawler treats as "no further page"
        # (presumably the href of a disabled next button - confirm against the
        # site); a page without a next link at all yields None.
        if not next_url or next_url == 'javascript:;':
            break

        full_url = 'https://hr.tencent.com/' + next_url
def load_data(url, timeout=10):
    """Send a GET request and return the page source.

    :param url: absolute URL of the page to download.
    :param timeout: seconds to wait for the server before ``requests`` gives
        up; without a timeout a stalled connection would block forever.
    :return: the response body as text when the status code is 200,
        otherwise ``None``.
    :raises requests.RequestException: propagated unchanged from
        ``requests.get`` (connection errors, timeouts, ...).
    """
    # Send a browser-like User-Agent with every request.
    req_header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=req_header, timeout=timeout)
    if response.status_code == 200:
        return response.text
    # Any other status is reported to the caller as "no page".
    return None
def parse_page_data(html):
    """Parse one job-listing page, print every job found, and return the next-page link.

    Every ``tr`` row with class ``even`` or ``odd`` is treated as one job.
    For each row the title, type, head count, location and publish time are
    read from the row's cells, the detail page behind the title link is
    downloaded and parsed with ``parse_detail_data``, and the resulting dict
    is printed. Rows are processed as all ``even`` rows followed by all
    ``odd`` rows.

    :param html: HTML source of a listing page.
    :return: the ``href`` of the ``a`` element with id ``next``, or ``None``
        when no such element/attribute exists.
    """
    # Build the pyquery document for the listing page.
    html_pq = PyQuery(html)

    # Select all rows once, then narrow them down with filter().
    rows = html_pq('tr')
    tr_all = rows.filter('.even') + rows.filter('.odd')

    # items() yields each matched row as its own PyQuery object.
    for tr in tr_all.items():
        title_link = tr('td.l.square a')
        cells = tr('td')

        jobinfo = {}
        # Job title
        jobinfo['title'] = title_link.text()
        # Job type; eq(n) picks the cell at index n (indexes start at 0)
        jobinfo['type'] = cells.eq(1).text()
        # Number of openings
        jobinfo['number'] = cells.eq(2).text()
        # Location
        jobinfo['address'] = cells.eq(3).text()
        # Publish time
        jobinfo['time'] = cells.eq(4).text()

        # Detail text. attr() returns None when the row has no title link,
        # and load_data returns None on a non-200 response; in both cases
        # fall back to empty content instead of crashing on None.
        detail_html = None
        href = title_link.attr('href')
        if href:
            detail_html = load_data('https://hr.tencent.com/' + href)
        jobinfo['content'] = parse_detail_data(detail_html) if detail_html else ''

        print(jobinfo)

    # Extract the address of the next page.
    next_url = html_pq('a').filter('#next').attr('href')
    return next_url
def parse_detail_data(html):
    """Extract the detail text of a job page as one comma-separated string.

    :param html: HTML source of a job detail page.
    :return: the text of every ``li`` inside ``ul.squareli``, joined with ','.
    """
    document = PyQuery(html)
    # The detail content lives in the li items of the "squareli" lists.
    detail_items = document('ul.squareli li').items()
    return ','.join(item.text() for item in detail_items)
if __name__ == '__main__':
    # Start offset passed in the "start" query parameter of the first page.
    offset = 0
    # URL of the first listing page; the crawl proceeds from here.
    full_url = ''.join(['https://hr.tencent.com/position.php?&start=', str(offset)])
    tencentjob(full_url)
上一篇: python DHT网络爬虫
下一篇: Python 简单网络爬虫