智联招聘职位信息爬取并写入xls表中
程序员文章站
2022-05-09 17:44:12
...
环境:python2.7
1. 在 cmd 命令行下安装第三方库:pip install requests xlwt(requests 用于请求网页,xlwt 用于往表格中写入数据)
主要代码
#coding: utf-8
import requests
import re
import xlwt
# Request headers that imitate a real browser, one simple way to get past
# basic anti-crawler checks.
# The value was taken by opening the page in a browser, inspecting the
# network request, and copying the User-Agent request header.
# The string below is the well-formed Firefox 52 / Windows 10 User-Agent; the
# earlier literal had an empty "()" pair, an unbalanced ")" and "Geck"
# instead of "Gecko".
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
class ZLZP(object):
    """Interactive scraper for job listings on sou.zhaopin.com.

    Workflow (see ``start``): ask the user for cities and a job keyword,
    look up how many result pages exist, download the requested number of
    pages, extract (job, company, salary, location) tuples with regular
    expressions, and write them to an ``.xls`` workbook via ``xlwt``.

    Targets Python 2.7: it relies on ``raw_input`` and on the page body and
    the string literals both being byte strings.
    """

    # Maximum number of cities accepted by ``input_data``.
    _MAX_CITIES = 5
    # Captures the total page count from the pager script in the result page.
    _TOTAL_PAGE_PAT = re.compile(r'this.form.goto.value,(.*?),')
    # Captures (job title, company, monthly salary, location) for each row.
    _JOB_ROW_PAT = re.compile(
        r'<a style="font-weight.*?blank">(.*?)</a>.*?<td class="gsmc">.*?blank">(.*?)</a>.*?<td class="zwyx">(.*?)<.*?<td class="gzdd">(.*?)<',
        re.S)
    # Matches any HTML tag left inside a captured job title.
    _TAG_PAT = re.compile('<.*?>')

    def __init__(self):
        self.city1 = ''         # '%2B'-joined city names used in the URL
        self.job1 = ''          # job keyword used in the URL and file name
        self.html = ''          # body of the most recently fetched page
        self.result_list = []   # one list of row tuples per fetched page
        self.total_page = 0     # total result pages reported by the site

    def input_data(self):
        """Prompt for up to five cities (``0`` ends the list) and a job keyword."""
        city_list = []
        while len(city_list) < self._MAX_CITIES:
            city = raw_input('请输入要查询的城市(最多5个,输入0结束):')
            if city == '0':
                break
            city_list.append(city)
        # The cities are joined with '%2B' (a URL-encoded '+') into one string.
        self.city1 = '%2B'.join(city_list)
        self.job1 = raw_input('工作:')

    def get_html(self, city, job, pg, timeout=10):
        """Download one result page and store its body in ``self.html``.

        :param city: '%2B'-joined city names.
        :param job: job keyword.
        :param pg: 1-based page number.
        :param timeout: seconds to wait for the server before ``requests``
            raises, so a stalled connection cannot hang the script forever.
        """
        url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%s&kw=%s&isadv=0&sg=002566a8ae0d4f978d7d1d0b44fc8cc4&p=%s' % (
            city, job, pg)
        self.html = requests.get(url, headers=headers, timeout=timeout).content

    @classmethod
    def _parse_total_page(cls, html):
        """Return the total page count found in *html*.

        :raises ValueError: if the pager marker is missing or not an integer.
        """
        match = cls._TOTAL_PAGE_PAT.search(html)
        if match is None:
            raise ValueError('total page count not found in the result page')
        return int(match.group(1))

    @classmethod
    def _clean_job_title(cls, raw_title):
        """Strip HTML tags and spaces from a captured job title."""
        return cls._TAG_PAT.sub('', raw_title).replace(' ', '')

    @staticmethod
    def _read_page_count(prompt):
        """Read an integer from the user; return 0 for non-numeric input.

        ``raw_input`` + ``int`` is used instead of Python 2 ``input``, which
        evaluates whatever the user types as an expression.
        """
        try:
            return int(raw_input(prompt))
        except ValueError:
            return 0

    def get_total_page(self):
        """Fetch page 1 and record the total number of result pages.

        :raises ValueError: if the page does not contain the pager marker.
        """
        self.get_html(self.city1, self.job1, 1)
        self.total_page = self._parse_total_page(self.html)

    def get_data(self):
        """Ask how many pages to fetch, then collect the rows of each page."""
        print('共%s页|' % self.total_page)
        if self.total_page < 1:
            # No valid answer exists, so prompting would never terminate.
            return
        pages = self._read_page_count('请输入您想打印的数据页数:')
        while not 1 <= pages <= self.total_page:
            pages = self._read_page_count('输入非法,请重新输入:')
        for page_no in range(1, pages + 1):
            self.get_html(self.city1, self.job1, page_no)
            self.result_list.append(self._JOB_ROW_PAT.findall(self.html))

    def save_data(self):
        """Write every collected row to an ``.xls`` workbook and save it once."""
        workbook = xlwt.Workbook(encoding='utf-8')
        # Add one sheet that holds all rows.
        sheet = workbook.add_sheet(u'智联招聘职位表')
        for col, title in enumerate(('职位名称', '公司名称', '月薪', '地点')):
            sheet.write(0, col, title)
        row = 1
        # Alternative plain-text output (disabled):
        #file_name =self.city1+self.job1+'.txt'
        #file = open(file_name.decode('utf-8'), 'w')
        for page_no, rows in enumerate(self.result_list, 1):
            print('========================================正在写入第%s页,请稍后。。。=====================================' % page_no)
            for detail in rows:
                job = self._clean_job_title(detail[0])
                company = detail[1]
                yuexin = detail[2]
                city = detail[3]
                #file.write('%s %s %s %s\n' % (job, company, yuexin, city))
                for col, value in enumerate((job, company, yuexin, city)):
                    sheet.write(row, col, value)
                row += 1
        #file.close()
        # Save once after all rows are written instead of once per row.
        # NOTE(review): self.job1 is a byte string from raw_input; formatting
        # it into a unicode file name may fail for non-ASCII keywords under
        # Python 2 -- confirm against the console encoding in use.
        workbook.save(u'智联招聘%s职位信息表.xls' % self.job1)
        print('写入完成!!!')

    def start(self):
        """Run the full workflow: input, page count, download, save."""
        self.input_data()
        # get_total_page fetches page 1 itself, so no separate fetch is needed.
        self.get_total_page()
        self.get_data()
        self.save_data()
# Script entry point: create the scraper and run the interactive workflow
# (prompt for input, fetch the pages, write the .xls file).
zlzp=ZLZP()
zlzp.start()
其中以 # 注释掉的部分代码用于将所得数据写入 .txt 文件,效果相同。
上一篇: 北京大暑吃什么好?
推荐阅读