欢迎您访问程序员文章站!本站旨在为大家分享程序员计算机编程知识。
您现在的位置是: 首页

spider----51job实战

程序员文章站 2022-05-08 11:20:39
...

bs4 学习:

import re

from bs4 import BeautifulSoup

# Parse the local demo page with the lxml backend.
soup = BeautifulSoup(open('index.html', encoding='utf8'), 'lxml')
# Collect every <a> tag on the page, then print the text of the first one.
anchors = soup.findAll('a')
print(anchors[0].string)

抓取51job案例如下:

import re
import urllib.request
from bs4 import BeautifulSoup
import time


def _cell_text(row, selector):
    """Return the stripped text of the first node matching *selector* in *row*.

    Returns '' when the selector matches nothing or when the matched tag has
    nested children (bs4's .string is None in that case) — this prevents the
    AttributeError/IndexError the naive ``select(...)[0].string.replace(...)``
    chain raises on irregular rows.
    """
    nodes = row.select(selector)
    value = nodes[0].string if nodes else None
    return (value or '').replace('\n', '').strip()


def soup_content(content, fp):
    """Parse one 51job search-result page and append one job per line to fp.

    content: decoded HTML text of a result page.
    fp: an open, writable text file; one ``str(dict)`` line is written per job.
    """
    soup = BeautifulSoup(content, 'lxml')
    # The first '.el' row inside #resultList is the table header -- skip it.
    rows = soup.select('#resultList > .el')[1:]
    for row in rows:
        item = {
            'job_name': _cell_text(row, '.t1 a'),
            'company': _cell_text(row, '.t2 a'),
            'job_where': _cell_text(row, '.t3'),
            'salary': _cell_text(row, '.t4'),
            'publish_time': _cell_text(row, '.t5'),
        }
        # Persist as one str(dict) per line (keeps the original file format).
        fp.write(str(item) + '\n')


#
def hanlder_request(keyword, page, url):
    """Build a urllib Request for one search-result page.

    *keyword* and *page* are substituted into the two ``{}`` slots of the
    *url* template; a browser User-Agent is attached so 51job serves the
    normal HTML page.
    """
    target = url.format(keyword, page)
    # Echo the URL so crawl progress is visible on the console.
    print(target)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    return urllib.request.Request(url=target, headers=headers)


def main():
    """Prompt for a keyword and a page range, then crawl 51job and save jobs.

    Each page is fetched via hanlder_request/urlopen, decoded as GBK
    (51job historically served GBK-encoded pages) and parsed by
    soup_content, which appends one line per job to the output file.
    """
    keyword = input("请输入要搜索的关键字: ")
    start_page = int(input("请输入要起始页码: "))
    end_page = int(input("请输入结束页码: "))
    # Search template: the two {} slots receive the keyword and page number.
    # 080200 is an area code in the 51job URL scheme (Hangzhou, per the
    # output filename) -- TODO confirm against 51job's URL docs.
    url = 'http://search.51job.com/list/080200,000000,0000,00,9,99,{},2,{}.html'
    # 'with' guarantees the output file is closed even if a page fails,
    # fixing the leak the explicit open()/close() pair had on errors.
    with open('杭州python工作', 'w', encoding='utf8') as fp:
        for page in range(start_page, end_page + 1):
            request = hanlder_request(keyword, page, url)
            response = urllib.request.urlopen(request)
            content = response.read().decode('gbk')
            print('正在下载第%s页......' % page)
            soup_content(content, fp)
            # Be polite to the server between page fetches.
            time.sleep(2)
            print('第%s页下载完成...' % page)


if __name__ == "__main__":
    main()

 

相关标签: spider