欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

智联招聘职位信息爬取并写入xls表中

程序员文章站 2022-05-09 17:44:12
...

环境:python2.7

1. 在cmd命令行中安装第三方库:pip install xlwt(用于往表格中写入数据);代码还用到了requests库,如未安装可执行 pip install requests


主要代码

#coding: utf-8
import requests
import re
import xlwt
# Request headers that make the scraper look like a regular browser, which is
# one simple way around basic anti-crawler checks.
# The value can be copied from a real browser: open the page, inspect the
# request in the developer tools' network panel and take the User-Agent
# request header.
# The original literal was mistyped ("()windows", "Geck"); this is the
# well-formed Firefox 52 on Windows 10 string it was meant to be.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
}
class ZLZP(object):
    """Interactive scraper for Zhaopin job search results.

    The workflow (see ``start``) prompts for cities and a job keyword,
    downloads the search result pages, extracts
    (job title, company, salary, location) rows with regular expressions and
    writes them to an .xls workbook through xlwt.

    Targets Python 2.7: user input is read with ``raw_input``.
    """

    # Compiled once at class level instead of on every page / every row.
    _TOTAL_PAGE_PATTERN = re.compile(r'this.form.goto.value,(.*?),')
    _JOB_PATTERN = re.compile(
        r'<a style="font-weight.*?blank">(.*?)</a>.*?<td class="gsmc">.*?blank">(.*?)</a>.*?<td class="zwyx">(.*?)<.*?<td class="gzdd">(.*?)<',
        re.S)
    _TAG_PATTERN = re.compile('<.*?>')

    def __init__(self):
        self.city1 = ''         # cities joined with '%2B', as used in the URL
        self.job1 = ''          # job keyword typed by the user
        self.html = ''          # body of the most recently fetched page
        self.result_list = []   # one list of matched rows per fetched page
        self.total_page = 0     # page count reported by the first result page

    @staticmethod
    def _read_int(prompt):
        """Prompt once and return the typed integer, or None if it is not one.

        Replaces Python 2 ``input()``, which evaluates whatever the user types
        as an expression and crashes on plain non-numeric text.
        """
        try:
            return int(raw_input(prompt))
        except ValueError:
            return None

    def input_data(self):
        """Ask for up to five cities (``0`` ends the list) and a job keyword."""
        cities = []
        while len(cities) < 5:
            city = raw_input('请输入要查询的城市(最多5个,输入0结束):')
            if city == '0':
                break
            cities.append(city)
        # The search URL expects the cities as one '%2B'-separated value.
        self.city1 = '%2B'.join(cities)
        self.job1 = raw_input('工作:')

    def get_html(self, city, job, pg):
        """Fetch result page ``pg`` for ``city``/``job`` into ``self.html``.

        Raises whatever ``requests.get`` raises, including a timeout error if
        the server does not answer within the timeout.
        """
        url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%s&kw=%s&isadv=0&sg=002566a8ae0d4f978d7d1d0b44fc8cc4&p=%s' % (
            city, job, pg)
        # A timeout keeps the script from hanging forever on a stalled server.
        self.html = requests.get(url, headers=headers, timeout=30).content

    def get_total_page(self):
        """Fetch page 1 and store the total page count in ``self.total_page``.

        The count is left at 0 when the page does not contain the expected
        marker or the captured text is not a number, instead of crashing.
        """
        self.get_html(self.city1, self.job1, 1)
        match = self._TOTAL_PAGE_PATTERN.search(self.html)
        try:
            self.total_page = int(match.group(1)) if match else 0
        except ValueError:
            self.total_page = 0

    def get_data(self):
        """Ask how many pages to scrape and collect their rows.

        Appends one list of (title, company, salary, location) tuples per page
        to ``self.result_list``. Returns without prompting when no pages are
        available, since no page count could ever be valid in that case.
        """
        print('共%s页|' % self.total_page)
        if self.total_page < 1:
            print('未获取到可用的数据页,无法继续。')
            return
        pages = self._read_int('请输入您想打印的数据页数:')
        while pages is None or not 1 <= pages <= self.total_page:
            pages = self._read_int('输入非法,请重新输入:')
        for page in range(1, pages + 1):
            self.get_html(self.city1, self.job1, page)
            self.result_list.append(self._JOB_PATTERN.findall(self.html))

    def save_data(self):
        """Write every collected row to an .xls workbook named after the job.

        The workbook is saved once after all rows are written (the header-only
        sheet is still saved when nothing was collected).
        """
        import sys  # local import: only needed to decode the job keyword

        workbook = xlwt.Workbook(encoding='utf-8')
        # Single sheet holding all rows.
        sheet = workbook.add_sheet(u'智联招聘职位表')
        for col, title in enumerate(('职位名称', '公司名称', '月薪', '地点')):
            sheet.write(0, col, title)
        # Alternative output kept from the article: write the rows to a text
        # file instead of a spreadsheet.
        # file_name = self.city1 + self.job1 + '.txt'
        # file = open(file_name.decode('utf-8'), 'w')
        # file.write('%s    %s    %s    %s\n' % (job, company, yuexin, city))
        # file.close()
        row = 1
        for page_index, page_rows in enumerate(self.result_list):
            print('========================================正在写入第%s页,请稍后。。。=====================================' % (page_index + 1))
            for detail in page_rows:
                # The title cell may contain markup and padding spaces.
                job = self._TAG_PATTERN.sub('', detail[0]).replace(' ', '')
                for col, value in enumerate((job, detail[1], detail[2], detail[3])):
                    sheet.write(row, col, value)
                row += 1
        # raw_input returns a byte string on Python 2; formatting it into a
        # unicode file name would raise UnicodeDecodeError for non-ASCII
        # keywords, so decode it explicitly first.
        job_name = self.job1
        if isinstance(job_name, bytes):
            encoding = getattr(sys.stdin, 'encoding', None) or 'utf-8'
            job_name = job_name.decode(encoding, 'replace')
        workbook.save(u'智联招聘%s职位信息表.xls' % job_name)
        print('写入完成!!!')

    def start(self):
        """Run the whole flow: prompt, count pages, scrape, save."""
        self.input_data()
        # get_total_page fetches page 1 itself, so no separate get_html call.
        self.get_total_page()
        self.get_data()
        self.save_data()
# Script entry point: build the scraper and run the whole interactive flow
# (prompt for input, fetch pages, write the spreadsheet).
zlzp=ZLZP()
zlzp.start()
其中以#注释掉的部分代码用于将所得数据写入.txt文件,效果相同。