爬取贝壳租房信息并存储到 MongoDB
程序员文章站
2022-07-14 10:56:31
...
前几天链家网升级成了贝壳找房,整合了十几家知名公寓的租房信息,太赞了,正好有多方面的数据来源可以分析。下面的代码把数据存储到 MongoDB(忍不住吐槽一下:MongoDB 对 32 位系统真不友好,问题很多——要么需要安装补丁,要么每次启动都要重新指定存储位置,有时还要手动解除进程锁);也可以改成存储到 Excel,把代码里被注释掉的两行(生成 output 并调用 savetoexcel)取消注释就行了。下面是爬取贝壳找房的代码:
import pymongo
import requests,codecs
import pymongo,time
from lxml import html
from multiprocessing import Pool
# Shared handle to the local MongoDB server; listings go into the
# "beike" collection of the "testdb" database.
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client.get_database('testdb')
myset = db.get_collection('beike')
def _parse_listing(item):
    """Extract one listing's fields from its container element.

    ``item`` is one of the per-listing ``div`` nodes selected by
    ``get_content``; every XPath below is relative to that node.

    Returns:
        dict with the keys ``community``, ``addr``, ``price``, ``lanlord``,
        ``postime``, ``label`` and ``urls``. ``addr`` and ``label`` are lists
        of strings; the other values are single strings.

    Raises:
        IndexError: if a single-value field (link, title, landlord line,
            publish time or price) has no match under ``item``.
    """
    def first(path):
        # Single-value field: the first match, as a plain str rather than
        # lxml's str subclass.
        return str(item.xpath(path)[0])

    def every(path):
        # Multi-value field: keep every matching text node.
        return [str(text) for text in item.xpath(path)]

    return {
        'community': first('./div[1]/p[1]/a/text()').replace('\n', '').strip(),
        # Same per-listing prefix as the other fields, so the address comes
        # from this listing's own second paragraph.
        'addr': every('./div[1]/p[2]/a/text()'),
        'price': first('./div[1]/span/em/text()'),
        # The key is misspelt on purpose: it is the name already stored in
        # the collection and the column the reader script selects.
        'lanlord': first('./div[1]/p[3]/text()').replace('\n', '').strip(),
        'postime': first('./div[1]/p[4]/text()'),
        'label': every('./div[1]/p[5]/i/text()'),
        'urls': first('./div[1]/p[1]/a/@href'),
    }


def get_content(j, total_pages=100):
    """Scrape page ``j`` of the sh.zu.ke.com rental list into MongoDB.

    Downloads the page, parses every listing container under the
    ``content__article`` div and inserts one document per listing into the
    module-level ``myset`` collection. Failures are reported with ``print``
    and never raised, so one bad page or listing does not stop a
    ``Pool.map`` run.

    Args:
        j: page number substituted into the list URL.
        total_pages: total number of pages being crawled; used only for the
            "pages remaining" progress message.

    Returns:
        None.
    """
    print('正在爬取第{}页,还剩{}页'.format(j, total_pages - j))
    url = 'https://sh.zu.ke.com/zufang/pg{}/#contentList'.format(j)
    try:
        # The timeout keeps one stalled connection from hanging a worker.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        print('爬取失败')
        return
    if not response.text.strip():
        # lxml raises on an empty document; treat it as a failed page.
        print('爬取失败')
        return

    page = html.fromstring(response.text)
    listings = page.xpath('//div[@class="content__article"]/div[1]/div')
    for listing in listings:
        try:
            info = _parse_listing(listing)
        except IndexError as e:
            # A container without the expected fields: skip only this
            # listing and keep the rest of the page.
            print(e)
            print('爬取失败')
            continue
        # Alternative sink: uncomment the next two lines to also append the
        # listing as a tab-separated row to beike.xls.
        # output = "{community}\t{addr}\t{price}\t{lanlord}\t{postime}\t{label}\t{urls}\n".format(**info)
        # savetoexcel(output)
        try:
            # insert_one replaces Collection.insert, which newer PyMongo
            # releases no longer provide.
            myset.insert_one(info)
        except pymongo.errors.PyMongoError as e:
            print(e)
            print('写入失败')
def savetoexcel(output):
    """Append ``output`` to the file ``beike.xls`` in the working directory.

    The text is written verbatim as UTF-8; no Excel encoding is performed,
    so callers are expected to pass an already formatted row.

    Args:
        output: text to append.

    Returns:
        None. Any failure is reported with ``print`` and not raised.
    """
    try:
        # The context manager closes the handle even when write() raises.
        with codecs.open('beike.xls', 'a+', 'utf-8') as f:
            f.write(output)
    except Exception as e:
        # Best-effort sink: report the cause and carry on.
        print(e)
        print('写入失败')
if __name__ == '__main__':
    # Crawl pages 1-100 with two worker processes.
    pool = Pool(processes=2)
    try:
        pool.map(get_content, range(1, 101))
    finally:
        # Shut the workers down even if map() raises.
        pool.close()
        pool.join()
爬到之后,是这样的:
将数据转化为 DataFrame,方便用 pandas 操作:
import pymongo
import pandas as pd
# Open the connection to the local MongoDB server and select the
# "beike" collection of the "testdb" database.
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['testdb']
myset = db['beike']
def get_data(limit=100):
    """Read listings from MongoDB and return them as a pandas DataFrame.

    Args:
        limit: maximum number of documents to read from the module-level
            ``myset`` collection; passed to the cursor's ``limit()``.
            Defaults to 100.

    Returns:
        DataFrame with one row per document and the columns ``community``,
        ``addr``, ``price``, ``lanlord``, ``postime``, ``label`` and
        ``urls``. The ``_id`` field is not included. An empty collection
        yields an empty frame with the same columns.
    """
    columns = ['community', 'addr', 'price', 'lanlord', 'postime', 'label', 'urls']
    # Exclude _id in the query itself instead of deleting it per document.
    records = list(myset.find({}, {'_id': 0}).limit(limit))
    # Build the frame in one call: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every row.
    return pd.DataFrame(records, columns=columns)
# Count how many listings each landlord/agency value appears in.
df = get_data()
landlord = df['lanlord'].value_counts()
print(landlord)
上一篇: SICP 2.4 习题答案
下一篇: .gitignore