Scraping 51job with the urllib library
First, open the 51job site and inspect the page structure. All the fields we want are embedded directly in the page source as JSON, and the page is encoded as 'gbk', so we can pull the JSON out with a regular expression.
The code is as follows:
import re
import json
from urllib import request

# request page 1 of the search results ({} is the page-number slot)
url='https://search.51job.com/list/190200%252c040000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(1)
req=request.urlopen(url)
html=req.read().decode('gbk')
data=re.compile('window.__SEARCH_RESULT__ = (.*?)</script>',re.S)
newdata=re.findall(data,html)
# re.findall returns a list, so index into it; otherwise json.loads cannot decode it
newhtml=json.loads(newdata[0])
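A caveat: if urlopen comes back empty or blocked, the site may be rejecting requests that carry no browser User-Agent (this is an assumption about the site's behavior, not something the original code runs into). A minimal sketch of attaching headers with urllib's own Request class, using an example header value:

# a minimal sketch: send a browser-style User-Agent with urllib
# (the header value here is only an example)
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
req=request.Request(url,headers=headers)
html=request.urlopen(req).read().decode('gbk')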
The parsed JSON is a dict, so we just loop over it to pick out the values we want:
for h in range(0,len(newhtml['engine_search_result'])):
    key=newhtml['engine_search_result'][h]
    # grab the detail-page link and request it
    href=key['job_href']
    rsp=request.urlopen(href)
    # decoding the detail page with decode('gbk') alone raises an error,
    # because some detail pages contain characters gbk cannot decode,
    # so pass errors='ignore' to skip them
    newhtml1=rsp.read().decode('gbk',errors='ignore')
    # use xpath to pull the information out of the detail page
    jobmessage=etree.HTML(newhtml1).xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div//p/text()')
    jobmessage1=''
    for line in jobmessage:
        jobmessage1+=(line+'\n')
    jobadress=etree.HTML(newhtml1).xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p/text()')
    print('Job title:',key['job_name'])
    print('Company:',key['company_name'])
    print('Salary:',key['providesalary_text'])
    print('City:',key['workarea_text'])
    print('Updated:',key['updatedate'])
    print('Company type/size:',key['companytype_text']+'/'+key['companysize_text'])
    print('Posted:',key['issuedate'])
    print('Benefits:',key['jobwelf'])
    print('Requirements:',key['attribute_text'])
    print('Job description:',jobmessage1.replace(' ','').replace('\r\n',''))
    # some postings have no work address, so add a check; otherwise,
    # indexing the empty list xpath returns would raise an error
    if jobadress==[]:
        print('Work address: none')
    else:
        print('Work address:',jobadress[0])
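Besides errors='ignore', another option for the decoding problem above is 'gb18030', a superset of gbk that covers many more characters, so less text is silently dropped. A small sketch of swapping in that codec for the decode call above:

# alternative sketch: gb18030 is a superset of gbk and decodes
# characters that plain gbk rejects; errors='replace' keeps any
# remaining bad bytes visible instead of dropping them
newhtml1=rsp.read().decode('gb18030',errors='replace')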
That completes scraping 51job with urllib plus regular expressions and xpath.
The complete code is as follows:
import re
import json
from lxml import etree
from urllib import request

def get_data(page):
    for i in range(1,page+1):
        url='https://search.51job.com/list/190200%252c040000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(i)
        req=request.urlopen(url)
        html=req.read().decode('gbk')
        data=re.compile('window.__SEARCH_RESULT__ = (.*?)</script>',re.S)
        newdata=re.findall(data,html)
        newhtml=json.loads(newdata[0])
        for h in range(0,len(newhtml['engine_search_result'])):
            key=newhtml['engine_search_result'][h]
            href=key['job_href']
            rsp=request.urlopen(href)
            newhtml1=rsp.read().decode('gbk',errors='ignore')
            jobmessage=etree.HTML(newhtml1).xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div//p/text()')
            jobmessage1=''
            # the inner loop variable must not be named i, or it would
            # shadow the page counter used in the progress message below
            for line in jobmessage:
                jobmessage1+=(line+'\n')
            jobadress=etree.HTML(newhtml1).xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p/text()')
            print('Job title:',key['job_name'])
            print('Company:',key['company_name'])
            print('Salary:',key['providesalary_text'])
            print('City:',key['workarea_text'])
            print('Updated:',key['updatedate'])
            print('Company type/size:',key['companytype_text']+'/'+key['companysize_text'])
            print('Posted:',key['issuedate'])
            print('Benefits:',key['jobwelf'])
            print('Requirements:',key['attribute_text'])
            print('Job description:',jobmessage1.replace(' ','').replace('\r\n',''))
            if jobadress==[]:
                print('Work address: none')
            else:
                print('Work address:',jobadress[0])
            print('=='*20+'finished job {1} on page {0}'.format(i,h+1)+'=='*20)

if __name__ == '__main__':
    page=int(input('Enter the number of pages to scrape: '))
    get_data(page)
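One more practical note: some job_href links go stale between the search page and the detail request, and urlopen then raises an exception from urllib.error, which would stop the whole loop. A sketch of one way to keep going, wrapping the detail fetch in a helper (the fetch function here is a hypothetical addition, not part of the original script):

# sketch of a hypothetical helper that skips dead detail links
# (an addition for robustness, not part of the original script)
from urllib import error, request

def fetch(href):
    # return the decoded detail page, or None if it cannot be fetched
    try:
        rsp=request.urlopen(href)
    except (error.HTTPError,error.URLError):
        return None
    return rsp.read().decode('gbk',errors='ignore')

Inside the loop you would then write newhtml1=fetch(href) and skip the posting when it returns None.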
Note: this article is for study and reference only.
Original post: https://blog.csdn.net/zp1534074749/article/details/109843836