A Python crawler for scraping Zhaopin (智联招聘) job listings
#coding:utf-8
import Queue
import threading

import requests
import pymongo
from lxml import etree
from bs4 import BeautifulSoup as bs

from config import *

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
class Position_Spider(threading.Thread):
    def __init__(self, queue=None):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        # Worker loop: pull listing-page URLs until the shared queue is drained.
        while not self.queue.empty():
            url = self.queue.get_nowait()
            self.request(url)
    def request(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        }
        resp = requests.get(url=url, headers=headers)
        if resp.status_code == 200:
            html = resp.content
            self.get_position(html)
        else:
            print "failed to open website"
            print resp.status_code
    def get_position(self, html):
        # Job-detail links on the listing page are the bold anchors.
        soup = bs(html, 'lxml')
        position_sites = soup.find_all(name="a", attrs={"style": "font-weight: bold"})
        for position_site in position_sites:
            position = position_site.string
            print position
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
            }
            resp = requests.get(url=position_site["href"], headers=headers)
            if resp.status_code == 200:
                html = resp.content
                self.positions_detail(html)
            else:
                print "failed to open website"
                print resp.status_code
    def positions_detail(self, html):
        # Absolute positional XPaths: they depend on the exact layout of
        # the page the crawler receives, so they are brittle.
        selector = etree.HTML(html)
        position_list = selector.xpath("//div[5]/div[1]/div[1]/h1/text()")
        company_list = selector.xpath("//div[5]/div[1]/div[1]/h2/a/text()")
        salary_list = selector.xpath("//div[6]/div[1]/ul/li[1]/strong/text()")
        city_list = selector.xpath("//div[6]/div[1]/ul/li[2]/strong/a/text()")
        record_list = selector.xpath("//div[6]/div[1]/ul/li[6]/strong/text()")
        experience_list = selector.xpath("//div[6]/div[1]/ul/li[5]/strong/text()")
        skill_list = selector.xpath("//div[@class='tab-inner-cont']/p[2]/span/text()")
        for p, c, s, ci, r, e, sk in zip(position_list, company_list, salary_list, city_list, record_list, experience_list, skill_list):
            #print p.encode('utf-8')
            print c.encode('utf-8')
            print s.encode('utf-8')
            print ci.encode('utf-8')
            print r.encode('utf-8')
            print e.encode('utf-8')
            print sk.encode('utf-8')
            result = {'position': p, 'company': c, 'salary': s, 'city': ci, 'record': r, 'experience': e, 'skill': sk}
            self.save_to_mongo(result)
    def save_to_mongo(self, result):
        # insert_one is the non-deprecated pymongo 3.x API.
        if db[MONGO_TABLE].insert_one(result):
            print "saved successfully"
            return True
        return False
    def get_url(self):
        queue = Queue.Queue()
        for i in range(1, 90):
            queue.put("http://sou.zhaopin.com/jobs/searchresult.ashx?bj=160000&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&isadv=0&p=" + str(i))
        threads = []
        threads_count = 1
        for i in range(threads_count):
            # Each worker is a separate Thread instance sharing the same queue.
            threads.append(Position_Spider(queue))
        for t in threads:
            t.start()
        for t in threads:
            t.join()
def main():
    position = Position_Spider()
    #for province in provinces:
    position.get_url()

if __name__ == '__main__':
    main()
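The script star-imports its Mongo settings from a config.py that sits next to it. A minimal sketch of that file; the host and names here are placeholders I chose for illustration, not values from the original post:

#coding:utf-8
# config.py -- placeholder values, adjust to your environment
MONGO_URL = 'localhost'
MONGO_DB = 'zhaopin'
MONGO_TABLE = 'positions'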
The XPath calls are a bit off: run standalone they work fine, but inside the program they go wrong.
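A likely cause: absolute paths like //div[6]/div[1]/ul/li[1]/strong depend on the exact position of every ancestor, so if the HTML the crawler receives differs at all from what the browser's inspector showed (an anti-bot page, a redirect, a server-side variant), they return empty lists even though the same expression works when pasted into the browser console. A debugging sketch in the same Python 2 style; the detail-page URL is a placeholder and the class-anchored XPath is an assumption about the old Zhaopin markup, to be verified against the dumped file:

#coding:utf-8
import requests
from lxml import etree

# Hypothetical example: one job-detail link as collected by get_position.
url = 'http://jobs.zhaopin.com/example.htm'  # replace with a real detail-page URL
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
}

resp = requests.get(url=url, headers=headers)
# Dump what the crawler actually received; open page_dump.html in a
# browser and compare it with the page the inspector showed when the
# XPath was written -- the two often differ for scrapers.
with open('page_dump.html', 'w') as f:
    f.write(resp.content)

selector = etree.HTML(resp.content)
# A relative, attribute-anchored XPath tolerates layout shifts better
# than an absolute //div[6]/div[1]/... chain. The class name below is
# an assumption -- check it against page_dump.html before relying on it.
salary_list = selector.xpath("//ul[contains(@class, 'terminal-ul')]/li[1]/strong/text()")
print salary_list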