spider----51job in practice
Learning bs4:
from bs4 import BeautifulSoup

# Build the BeautifulSoup object from a local HTML file
soup = BeautifulSoup(open('index.html', encoding='utf8'), 'lxml')
# print(type(soup))
# print(soup)
# ret = soup.a  # soup.a finds only the first <a> tag and returns a Tag object
# print(ret)
ret = soup.find_all('a')  # find_all returns a list of every matching tag
print(ret[0].string)  # .string is the tag's text, or None if the tag has child tags
# str.strip(chars) removes any of the given characters from both ends, e.g.:
# string = 'adasdasd我细化你很久了appdadaa'
# string.strip('adsp')
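
To tie these notes together, here is a minimal self-contained sketch (the HTML string is invented for illustration) showing the difference between soup.a, find_all, and a CSS select, plus strip() cleaning whitespace:

from bs4 import BeautifulSoup

html = '<div id="links"><a href="/a">first</a><a href="/b"> second \n</a></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.a.string)             # 'first' -- soup.a is only the first <a>
print(len(soup.find_all('a')))   # 2 -- find_all returns every match
print(soup.select('#links > a')[1].string.strip())  # 'second' -- bare strip() drops edge whitespace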
The full 51job scraping example follows:
import re
import urllib.request
from bs4 import BeautifulSoup
import time


def soup_content(content, fp):
    soup = BeautifulSoup(content, 'lxml')
    # Each job posting is a .el row under #resultList; skip the first row (column headers)
    content = soup.select('#resultList > .el')[1:]
    # content = soup.find_all('div', class_=re.compile(r'^el$'))
    for ct in content:
        job_name = ct.select('.t1 a')[0].string.replace('\n', '').strip()
        company = ct.select('.t2 a')[0].string.replace('\n', '').strip()
        job_where = ct.select('.t3')[0].string
        salary = ct.select('.t4')[0].string
        publish_time = ct.select('.t5')[0].string
        # print(job_name, company, job_where, salary, publish_time)
        item = {
            'job_name': job_name,
            'company': company,
            'job_where': job_where,
            'salary': salary,
            'publish_time': publish_time,
        }
        # print(item)
        # Cast the dict to a string and write it out, one record per line
        string = str(item)
        fp.write(string + '\n')


def handle_request(keyword, page, url):
    # Fill the keyword and page number into the URL template
    url = url.format(keyword, page)
    print(url)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    request = urllib.request.Request(url=url, headers=header)
    return request


def main():
    keyword = input("Enter the search keyword: ")
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    # url = 'http://search.51job.com/list/020000,000000,0000,00,9,99,{},2,{}.html'
    url = 'http://search.51job.com/list/080200,000000,0000,00,9,99,{},2,{}.html'
    fp = open('hangzhou_python_jobs.txt', 'w', encoding='utf8')
    for page in range(start_page, end_page + 1):
        request = handle_request(keyword, page, url)
        response = urllib.request.urlopen(request)
        # 51job serves its pages as GBK, not UTF-8
        content = response.read().decode('gbk')
        print('Downloading page %s......' % page)
        soup_content(content, fp)
        time.sleep(2)  # pause between pages to avoid hammering the site
        print('Page %s downloaded...' % page)
    fp.close()


if __name__ == "__main__":
    main()
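
One caveat: urllib.request can fail with a UnicodeEncodeError if the URL contains non-ASCII characters, so a Chinese search keyword should be percent-encoded before being formatted into the template. A minimal sketch (the helper name quote_keyword is an assumption, not part of the original script):

import urllib.parse

def quote_keyword(keyword):
    # Percent-encode the keyword so it is safe to embed in the URL path
    return urllib.parse.quote(keyword)

url = 'http://search.51job.com/list/080200,000000,0000,00,9,99,{},2,{}.html'
print(url.format(quote_keyword('python 爬虫'), 1))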
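
Writing str(item) per line is fine for eyeballing the output, but such records are awkward to parse back later. A sketch of the same write step using the standard json module instead, so each line is valid JSON (this is an alternative, not what the original code does; the output filename is illustrative):

import json

def write_item(item, fp):
    # One JSON object per line (JSON Lines); ensure_ascii=False keeps Chinese text readable
    fp.write(json.dumps(item, ensure_ascii=False) + '\n')

with open('hangzhou_python_jobs.jsonl', 'w', encoding='utf8') as fp:
    write_item({'job_name': 'python开发', 'salary': '1-1.5万/月'}, fp)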