
requests: Scraping Lianjia Rental Areas and Prices

# Import modules
import re
import pandas as pd
import random
import time
import requests
Scraping the data
# List to collect [name, area, price] rows
datalist = []

for i in range(1, 11):  # scrape 10 pages
    print('Scraping page %s' % i)
    url = 'https://gz.lianjia.com/zufang/pg' + str(i) + '/#contentList'
    proxies = [{'http': 'http://58.212.42.116:36708'}, {'http': 'http://117.57.91.53:9999'}, {'http': 'http://123.169.35.184:9999'}]
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.42 Safari/537.36'}
    response = requests.get(url, headers=header, proxies=random.choice(proxies))

    if response.status_code == 200:
        html = response.text
        
        # Regular expressions
        nameregex = re.compile(r'<a target="_blank" href=".*?">\n\s+(.*?)\s+</a>')  # \s matches any whitespace character
        name = re.findall(nameregex, html)    # names of all the communities (小区)
        arearegex = re.compile(r'([0-9.]+)㎡')
        area = re.findall(arearegex, html)    # floor areas of all the rental listings
        priceregex = re.compile(r'<em>([0-9.]+)</em> 元/月')
        price = re.findall(priceregex, html)  # monthly rents

        # Store the results; zip() avoids shadowing the page counter i
        # and stops at the shortest list if the match counts ever differ
        for n, a, p in zip(name, area, price):
            datalist.append([n, float(a), float(p)])

    # Pause between pages to reduce the risk of being blocked
    time.sleep(random.randint(6, 8))
df = pd.DataFrame(datalist, columns=['name', 'area', 'price']).sort_values('area', ascending=False)
df.to_csv('D:\\Desktop\\爬虫_anaconda\\链家面积和价格.csv', index=False)
print('Done saving')
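
As a quick sanity check after the script finishes (a minimal sketch: it assumes pandas is installed and reuses the CSV path from the to_csv call above; the derived column is only for illustration and is not part of the original script), you can read the file back and inspect it:

# Minimal sketch: re-load the saved CSV and inspect it
import pandas as pd

check = pd.read_csv('D:\\Desktop\\爬虫_anaconda\\链家面积和价格.csv')
# Illustrative derived column: monthly rent per square meter
check['price_per_sqm'] = check['price'] / check['area']
print(check.head())
print(check.describe())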

Tags: python, web scraping