欢迎您访问程序员文章站！本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页

house365 房源信息的采集

程序员文章站 2022-07-14 08:32:15
...

这是自己以前写的一个采集程序，比较简单。

<?php
// Pull in the project bootstrap file.
// NOTE(review): presumably this is where the `db` class is declared - confirm.
include 'global.php';

// Database handle; used further down for lookups and inserts.
$conn = new db;

// Read the four query-string inputs; each one falls back to an empty string
// when it is absent from the request.
$company  = isset($_GET['company'])  ? $_GET['company']  : '';
$memberid = isset($_GET['memberid']) ? $_GET['memberid'] : '';
$zj_num   = isset($_GET['zj_num'])   ? $_GET['zj_num']   : '';
$zj_code  = isset($_GET['zj_code'])  ? $_GET['zj_code']  : '';


// Build the listing-page URL. $zj_num is sent as the `page` parameter and
// $zj_code as the `agentcode` parameter. Both come straight from the query
// string, so they are URL-encoded to keep them from injecting extra
// parameters or breaking the URL.
$url = "http://zsb.house365.com/main.php?infotype=0&price=0&buildarea=0&district=0&keyword=&order_=1&page="
    . urlencode($zj_num)
    . "&agentcode=" . urlencode($zj_code)
    . "&pkind=selllist&roomtype=&topic=&order=";

// The fetch is best-effort: warnings stay suppressed, but a failed download is
// normalised to an empty page so the pattern match below receives a string and
// simply finds nothing.
$text = @file_get_contents($url);
if ($text === false) {
    $text = '';
}

// Collect every detail-page link on the listing page.
// $row[1] holds the href values, $row[2] the link titles.
preg_match_all('/<td align="center" valign="middle"><a href=\'(.*?)\' target=\'_blank\' title="(.*?)"><img src="http:\/\/sell.house365.com\/images\/sellesflist_12.gif" width="77" height="18" border="0" \/><\/a><\/td>/i',$text,$row);

// ---------------------------------------------------------------------------
// Detail-page scraping.
//
// For every detail URL collected in $row[1] the page is downloaded, its fields
// are extracted with regular expressions into $mrent, and the record is
// inserted into `hou_mrent` unless a row with the same memberid + loupan
// already exists. $num_all counts the rows actually inserted.
//
// NOTE(review): the Chinese literals below are matched against the raw page
// bytes, and only the stored text is passed through iconv GBK -> UTF-8. This
// presumably requires the file to be saved in the same encoding as the scraped
// pages - confirm.
// ---------------------------------------------------------------------------

if (!function_exists('h365_groups')) {
    /**
     * Extract the first $groupCount capture groups of the first match.
     *
     * @param string $pattern    PCRE pattern including delimiters.
     * @param mixed  $html       Page source; a non-string counts as "no match".
     * @param int    $groupCount Number of capture groups to return (>= 1).
     * @return array List of exactly $groupCount trimmed strings; groups that
     *               did not match are returned as ''.
     */
    function h365_groups($pattern, $html, $groupCount)
    {
        $groups = array_fill(0, $groupCount, '');
        if (is_string($html) && preg_match($pattern, $html, $m)) {
            for ($g = 1; $g <= $groupCount; $g++) {
                if (isset($m[$g])) {
                    $groups[$g - 1] = trim($m[$g]);
                }
            }
        }
        return $groups;
    }
}

if (!function_exists('h365_field')) {
    /**
     * Extract a single capture group of the first match.
     *
     * @param string $pattern PCRE pattern including delimiters.
     * @param mixed  $html    Page source; a non-string counts as "no match".
     * @param int    $group   Capture group number to return.
     * @return string Trimmed group text, or '' when there is no match.
     */
    function h365_field($pattern, $html, $group = 1)
    {
        $groups = h365_groups($pattern, $html, $group);
        return $groups[$group - 1];
    }
}

if (!function_exists('h365_labelled_value')) {
    /**
     * Find the text following $label on the last line that starts with it.
     *
     * The prefix length is taken from the label itself, so the comparison is
     * correct whatever byte length the label has in the file's encoding.
     *
     * @param array  $lines Candidate lines; each one is trimmed before testing.
     * @param string $label Prefix to look for.
     * @return string Remainder of the matching line, or '' when none matches.
     */
    function h365_labelled_value($lines, $label)
    {
        $value = '';
        $labelLength = strlen($label);
        foreach ($lines as $line) {
            $line = trim($line);
            if (strncmp($line, $label, $labelLength) === 0) {
                $value = (string) substr($line, $labelLength);
            }
        }
        return $value;
    }
}

if (!function_exists('h365_facility_flags')) {
    /**
     * Turn a comma-separated facility list into per-facility 0/1 flags.
     *
     * Every flag starts at 0 and only the facilities actually listed are set
     * to 1. There is no fall-through between facilities and no state carried
     * over from a previous listing.
     *
     * @param string $facilityText Comma-separated facility names.
     * @return array Map of `fac*` column name => 0 or 1.
     */
    function h365_facility_flags($facilityText)
    {
        $fieldByLabel = array(
            "宽带"     => 'facnet',
            "管道煤气" => 'facgas',
            "有线电视" => 'factvnet',
            "电话"     => 'facphone',
            "冰箱"     => 'facfridge',
            "电视机"   => 'factv',
            "洗衣机"   => 'facwasher',
            "热水器"   => 'facwheater',
            "空调"     => 'facaircon',
            "家具"     => 'facfurniture',
        );

        $flags = array();
        foreach ($fieldByLabel as $field) {
            $flags[$field] = 0;
        }
        foreach (explode(",", (string) $facilityText) as $item) {
            $item = trim($item);
            if (isset($fieldByLabel[$item])) {
                $flags[$fieldByLabel[$item]] = 1;
            }
        }
        return $flags;
    }
}

// Lookup tables are loop-invariant, so they are built once.

// Decoration label => fitment code; anything else falls back to 1.
$fitmentCodes = array(
    "毛坯"   => 1,
    "简装"   => 2,
    "精装"   => 3,
    "豪华装" => 4,
);

// Category id => property type label.
$categoryLabels = array(
    "2"  => "住宅",
    "31" => "写字楼",
    "32" => "商铺",
    "33" => "别墅",
);

// Zone id => district label.
$districtLabels = array(
    "46" => "玄武区",
    "45" => "鼓楼区",
    "48" => "白下区",
    "49" => "建邺区",
    "47" => "秦淮区",
    "44" => "下关区",
    "51" => "雨花台区",
    "50" => "栖霞区",
    "52" => "江宁区",
    "53" => "浦口区",
    "54" => "六合区",
    "55" => "溧水县",
    "56" => "高淳县",
    "60" => "其它",
);

$num_all = 0;
$detailUrls = (isset($row[1]) && is_array($row[1])) ? $row[1] : array();

foreach ($detailUrls as $url1) {
    // Best-effort download; a page that cannot be fetched is skipped instead
    // of being parsed as empty input.
    $text1 = @file_get_contents($url1);
    if ($text1 === false) {
        continue;
    }

    // Fixed columns plus the values supplied by the request.
    $mrent = array(
        'coltype'    => 'second',
        'menuid'     => '10',
        'memberid'   => $memberid,
        'memberprop' => '2',
        'shangquan'  => '0',
        'infotype'   => 'sale',
        'jz'         => '1',
        'author'     => $company,
    );

    // Listing title
    $loupan = h365_field('/<td colspan="2" class="fy_name"><h1 style="text-align:center;font-size:20px;font-family:黑体;font-weight:normal">(.*?)<\/h1><\/td>/i', $text1);
    $mrent['loupan'] = iconv("GBK", "UTF-8", $loupan);

    // Sale price
    $mrent['rentall'] = h365_field('/<td width="215" class="dash_line">售价:<span>(.*?)<\/span> 万元/i', $text1);

    // Floor area
    $mrent['area'] = h365_field('/<td class="dash_line">面积:<span>(.*?)<\/span> 平方米<\/td>/i', $text1);

    // Unit price
    $mrent['rentaverage'] = h365_field('/<td class="dash_line">单价:(\d*) 元\/平方米<\/td>/i', $text1);

    // Layout: rooms / halls / bathrooms / balconies
    list($mrent['shi'], $mrent['ting'], $mrent['wei'], $mrent['tai']) =
        h365_groups('/<td class="dash_line">户型:(\d)房(\d)厅(\d)卫(\d)阳台<\/td>/i', $text1, 4);

    // Floor and total floors
    list($mrent['nowfloor'], $mrent['allfloors']) =
        h365_groups('/<td class="dash_line">楼层:(\d*)楼,总高(\d*)层<\/td>/i', $text1, 2);

    // Year built
    $mrent['buildtime'] = h365_field('/<td class="dash_line">年代:(\d*)年<\/td>/i', $text1);

    // Decoration level, mapped to its numeric code (1 when unknown)
    $upholster = h365_field('/<td class="dash_line">装修:(.*?)<\/td>/i', $text1);
    $mrent['fitment'] = isset($fitmentCodes[$upholster]) ? $fitmentCodes[$upholster] : 1;

    // Orientation
    $faceTo = h365_field('/<td class="dash_line">朝向:(.*?)<\/td>/i', $text1);
    $mrent['chaoxiang'] = iconv("GBK", "UTF-8", $faceTo);

    // Property type and its category id (0 when the type is not in the table)
    $housetype = h365_field('/<td class="dash_line">类型:<a (.*?)>(.*?)<\/a><\/td>/i', $text1, 2);
    $mrent['housetype'] = iconv("GBK", "UTF-8", $housetype);
    $catid = array_search($housetype, $categoryLabels, true);
    $mrent['catid'] = ($catid === false) ? 0 : $catid;

    // Last update time as a Unix timestamp (false when it cannot be parsed)
    $mrent['uptime'] = strtotime(h365_field('/<td class="dash_line">更新时间:(.*?)<\/td>/i', $text1));

    // District -> zone id. 0 when unknown, matching the catid convention above,
    // so that no boolean false ends up in the record or in SQL.
    $districtName = h365_field('/<td width="220" class="dash_line">区属:<a (.*?)>(.*?)<\/a>\s<\/td>/i', $text1, 2);
    $zoneid = array_search($districtName, $districtLabels, true);
    if ($zoneid === false) {
        $zoneid = 0;
    }
    $mrent['zoneid'] = $zoneid;

    // Sub-area ("board"): resolved to a `quyu` id inside the zone found above.
    $board = h365_field('/<td width="150" class="dash_line">板块:<a (.*?)>(.*?)<\/a>&nbsp;<\/td>/i', $text1, 2);
    $board = (string) iconv("GBK", "UTF-8", $board);
    $mrent['quyu'] = null;
    if ($zoneid !== 0) {
        // The scraped text is escaped before being embedded in the query,
        // including the LIKE wildcards.
        // NOTE(review): prefer the db wrapper's own escaping or parameter
        // binding if it offers one.
        $sql = "SELECT id FROM `quyu` where sort = " . (int) $zoneid
            . " and `name` like '%" . addcslashes(addslashes($board), '%_') . "%'";
        $quyu = $conn->Query2SingleRowArray($sql);
        if (isset($quyu['id'])) {
            $mrent['quyu'] = $quyu['id'];
        }
    }

    // Residential complex: only the address cell (third group) is stored.
    $address = h365_field('/<td class="dash_line">小区:<a (.*?)>(.*?)<\/a><\/td>\s*<td class="dash_line">(.*?)<\/td>/i', $text1, 3);
    $mrent['address'] = iconv("GBK", "UTF-8", $address);

    // Contact person and phone
    list($contactTel, $contactName) =
        h365_groups('/<td width="245" align="center"><strong><span>(.*?)<\/span><\/strong><\/td>\s*<td width="185">联系人:<span>(.*?)<\/span><\/td>/i', $text1, 2);
    $mrent['lxr'] = iconv("GBK", "UTF-8", $contactName);
    $contactTel = str_replace(array('<span style="font-size:14px">', '</span>'), "", $contactTel);
    // The last "-"-separated segment is stored as the phone number.
    // NOTE(review): this keeps the previous behaviour, where every segment
    // overwrote the one before it whatever its length - confirm that dropping
    // the leading segments (e.g. an area code) is intended.
    $telParts = explode("-", $contactTel);
    $mrent['lxdh'] = end($telParts);

    // Free-form details: the first <br>-separated line is the publish date,
    // the following lines are "label:value" entries.
    $infoHtml = h365_field('/<div class="infor_fp_con">\s*(.*?)\s*<\/div>/i', $text1);
    $infoLines = explode("<br>", $infoHtml);
    $mrent['fbdate'] = iconv("GBK", "UTF-8", trim($infoLines[0]));

    $detailLines = array_slice($infoLines, 1);
    $base   = h365_labelled_value($detailLines, "基础配套:");
    $attach = h365_labelled_value($detailLines, "附属设施:");

    // Facility flags are rebuilt from scratch for every listing.
    $mrent = array_merge($mrent, h365_facility_flags($base . ',' . $attach));

    // Listings without a title are not stored.
    if ($mrent['loupan'] != '') {
        // Duplicate check on member + title. memberid is forced to an integer
        // and the title is escaped, because both originate outside this script.
        $sql = "SELECT id FROM `hou_mrent` where memberid = " . (int) $memberid
            . " AND loupan = '" . addslashes($mrent['loupan']) . "'";
        $chk = $conn->Query2SingleRowArray($sql);
        $alreadyStored = isset($chk['id']) && $chk['id'] != '';

        if (!$alreadyStored) {
            $conn->insert('hou_mrent', $mrent, true);
            if ($conn->GetQueryAffectedRows() > 0) {
                $num_all++;
            }
        }
    }
}

// Summary shown to the user: the message text with the number of inserted
// listings spliced in, converted from GBK to UTF-8 in a single step.
$note = iconv("GBK", "UTF-8", "抓取完成,本页一共抓取" . $num_all . "条房源");
?>    
<script type="text/javascript">
// Push the summary message into the element called 'note' in the parent document.
// NOTE(review): presumably this page is loaded inside a frame of the page that
// started the scrape - confirm. document.all is a legacy lookup API.
parent.document.all('note').innerHTML="<?=$note?>";
// Set `disabled` to an empty string on the two controls 'btn_sc' and 'btn_zq'
// in the parent document, which re-enables them.
parent.document.all('btn_sc').disabled="";
parent.document.all('btn_zq').disabled="";
</script>

 

相关标签: 采集