
A Python spider for scraping Zhaopin (智联招聘) job listings
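
The script below walks the paginated Zhaopin search results, follows each job link it finds, pulls the detail fields out with XPath, and writes every record to MongoDB. It is written for Python 2 (print statements, the Queue module).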

# coding: utf-8

import Queue
import threading

import pymongo
import requests
from lxml import etree
from bs4 import BeautifulSoup as bs
from config import *  # expected to provide MONGO_URL, MONGO_DB and MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

class Position_Spider(threading.Thread):

    def __init__(self, queue):
        threading.Thread.__init__(self)
        # Each worker thread shares one queue of listing-page URLs.
        self.queue = queue

    def run(self):
        # Thread.run() takes no arguments; drain the shared queue instead.
        while not self.queue.empty():
            url = self.queue.get_nowait()
            self.request(url)

    def request(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        }
        # Call the result "response", not "re", to avoid shadowing the re module.
        response = requests.get(url=url, headers=headers)
        if response.status_code == 200:
            self.get_position(response.content)
        else:
            print "open website failed"
            print response.status_code

    def get_position(self, html):
        # Job links on the search-result page are the bold anchors.
        soup = bs(html, 'lxml')
        position_sites = soup.find_all(name="a", attrs={"style": "font-weight: bold"})
        for position_site in position_sites:
            position = position_site.string
            print position

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
            }
            response = requests.get(url=position_site["href"], headers=headers)
            if response.status_code == 200:
                self.positions_detail(response.content)
            else:
                print "open website failed"
                print response.status_code

    def positions_detail(self, html):
        selector = etree.HTML(html)

        # These positional expressions are tied to the current detail-page
        # layout; any change to that layout will silently return empty lists.
        position_list = selector.xpath("//div[5]/div[1]/div[1]/h1/text()")
        company_list = selector.xpath("//div[5]/div[1]/div[1]/h2/a/text()")
        salary_list = selector.xpath("//div[6]/div[1]/ul/li[1]/strong/text()")
        city_list = selector.xpath("//div[6]/div[1]/ul/li[2]/strong/a/text()")
        record_list = selector.xpath("//div[6]/div[1]/ul/li[6]/strong/text()")
        experience_list = selector.xpath("//div[6]/div[1]/ul/li[5]/strong/text()")
        skill_list = selector.xpath("//div[@class='tab-inner-cont']/p[2]/span/text()")

        for p, c, s, ci, r, e, sk in zip(position_list, company_list, salary_list,
                                         city_list, record_list, experience_list, skill_list):
            print c.encode('utf-8')
            print s.encode('utf-8')
            print ci.encode('utf-8')
            print r.encode('utf-8')
            print e.encode('utf-8')
            print sk.encode('utf-8')
            result = {'position': p, 'company': c, 'salary': s, 'city': ci,
                      'record': r, 'experience': e, 'skill': sk}
            # save_to_mongo is a method; call it once per record, inside the loop.
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        # Note: insert() is deprecated in pymongo 3.x; insert_one() is the modern call.
        if db[MONGO_TABLE].insert(result):
            print "saved to mongo"
            return True
        return False

def get_url():
    # Queue up the paginated search-result URLs.
    queue = Queue.Queue()
    for i in range(1, 90):
        queue.put("http://sou.zhaopin.com/jobs/searchresult.ashx?bj=160000&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&isadv=0&p=" + str(i))

    # Create the worker threads; start() spawns a thread that executes run().
    # Calling run() directly would scrape synchronously on the main thread.
    threads = []
    threads_count = 1
    for i in range(threads_count):
        threads.append(Position_Spider(queue))

    for t in threads:
        t.start()

    for t in threads:
        t.join()

		
def main():
    get_url()


if __name__ == '__main__':
    main()
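
The script expects a config.py module alongside it; that is what from config import * pulls in. A minimal sketch, with placeholder values you would swap for your own MongoDB setup:

# config.py -- placeholder values, adjust for your own deployment
MONGO_URL = 'localhost'        # host passed to pymongo.MongoClient
MONGO_DB = 'zhaopin'           # database name
MONGO_TABLE = 'positions'      # collection name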

The XPath calls are a bit off: run on their own they work fine, but inside the program they misbehave.
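
One plausible culprit, an assumption rather than something confirmed against the live site: positions_detail() parses the raw response.content bytes, so a page whose charset lxml guesses wrong can yield empty lists even though the same expression works on a cleanly decoded string. A standalone test sketch for one detail page, decoding explicitly before parsing (the URL is a placeholder to fill in):

# coding: utf-8
# Standalone XPath check: fetch one detail page, decode it explicitly,
# then run a single expression against the decoded text.
import requests
from lxml import etree

url = 'http://jobs.zhaopin.com/...'  # paste a real detail-page URL here
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
response.encoding = response.apparent_encoding  # let requests sniff the charset
selector = etree.HTML(response.text)            # parse decoded text, not raw bytes

print selector.xpath("//div[5]/div[1]/div[1]/h1/text()")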