# requests: scraping Lianjia rental listing area and price
# Source article: 程序员文章站, 2022-07-13 13:00:31
#导入模块
import os
import re
import pandas as pd
import random
import time
import requests
# --- Scrape the data ---
# Accumulator for one [name, area, price] row per listing, across all pages.
datalist = []

# Proxy pool: one entry is chosen at random per request to spread the load
# across IPs and reduce the chance of being blocked.
# NOTE: requests needs a full URL for a proxy — the original third entry
# was missing the 'http://' scheme; fixed here.
proxies = [
    {'http': 'http://58.212.42.116:36708'},
    {'http': 'http://117.57.91.53:9999'},
    {'http': 'http://123.169.35.184:9999'},
]
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.42 Safari/537.36'}

# Compile the extraction patterns once, outside the page loop (raw strings
# so the \n / \s escapes are passed to the regex engine untouched).
nameregex = re.compile(r'<a target="_blank" href=".*?">\n\s+(.*?)\s+</a>')  # community/listing name
arearegex = re.compile(r'([0-9.]+)㎡')                 # floor area in square metres
priceregex = re.compile(r'<em>([0-9.]+)</em> 元/月')   # monthly rent

for page in range(1, 11):  # scrape the first 10 result pages
    print('正在爬取第%s页' % page)
    url = 'https://gz.lianjia.com/zufang/pg' + str(page) + '/#contentList'
    # timeout added so a dead proxy cannot hang the scrape forever
    response = requests.get(url, headers=header,
                            proxies=random.choice(proxies), timeout=10)
    if response.status_code == 200:
        html = response.text
        names = nameregex.findall(html)
        areas = arearegex.findall(html)
        prices = priceregex.findall(html)
        # zip truncates to the shortest list, so a page where one pattern
        # matched fewer times cannot raise an IndexError.
        for name, area, price in zip(names, areas, prices):
            datalist.append([name, float(area), float(price)])
    # Random pause between pages to avoid being rate-limited / banned.
    time.sleep(random.randint(6, 8))

# Sort rows by area, largest first, then persist to CSV without the index.
df = pd.DataFrame(datalist, columns=['name', 'area', 'price']).sort_values('area', ascending=False)
df.to_csv('D:\\Desktop\\爬虫_anaconda\\链家面积和价格.csv', index=False)
print('保存完成')
# Previous article: requests_猫眼电影 (Maoyan movies)
# Next article: Neural Network 初步 (Neural Network basics)