实例讲解实现抓取网上房产信息的ASP程序
<!-- #include file="conn.asp" -->
<!-- #include file="inc/function.asp" -->
<!doctype html public "-//w3c//dtd html 4.01 transitional//en" "http://www.w3.org/tr/html4/loose.dtd">
<html>
<head>
<title>untitled document</title>
<meta http-equiv="content-type" content="text/html; charset=gb2312">
<meta http-equiv="refresh" content="300;url=steal_house.asp">
</head>
<body>
<%
on error resume next
'
server.scripttimeout = 999999
'========================================================
'字符编码函数
'====================================================
function bytestobstr(body,code)
dim objstream
set objstream = server.createobject("adodb.stream")
objstream.type = 1
objstream.mode =3
objstream.open
objstream.write body
objstream.position = 0
objstream.type = 2
objstream.charset =code
bytestobstr = objstream.readtext
objstream.close
set objstream = nothing
end function
'取行字符串在另一字符串中的出现位置
function newstring(wstr,strng)
newstring=instr(lcase(wstr),lcase(strng))
if newstring<=0 then newstring=len(wstr)
end function
'替换字符串函数
function replacestr(ori,str1,str2)
replacestr=replace(ori,str1,str2)
end function
'====================================================
function readxml(url,code,start,ends)
set osend=createobject("microsoft.xmlhttp")
sourcecode = osend.open ("get",url,false)
osend.send()
readxml=bytestobstr(osend.responsebody,code )
start=instr(readxml,start)
readxml=mid(readxml,start)
ends=instr(readxml,ends)
readxml=left(readxml,ends-1)
end function
function substr(body,start,ends)
start=instr(body,start)
substr=mid(body,start+len(start)+1)
ends=instr(substr,ends)
substr=left(substr,ends-1)
end function
dim getcont,newscontent
dim url,title
url="新闻网址knowsky.com
getcont=readxml(url,"gb2312","<table class=k2 border=""0""","</table>")
getcont=regexhtml(getcont)
dim keyid,newsclass,city,position,housetype,level,area,price,demostra
dim contactman,contact
for i=2 to ubound(getcont)
response.write(getcont(i)&"__<br>")
templink=mid(getcont(i),instr(getcont(i),"href=""")+6,instr(getcont(i),""" onclick")-10)
templink=replace(templink,"../","")
response.write(i&":"&templink&"<br>")
newscontent=readxml(templink,"gb2312","<td valign=""bottom"" width=""400"">","<hr width=""760"" noshade size=""1"" color=""#808080""> ")
newscontent=removehtml(newscontent)
newscontent=replace(newscontent,vbcrlf,"")
newscontent=replace(newscontent,vbnewline,"")
newscontent=replace(newscontent," ","")
newscontent=replace(newscontent," ","")
newscontent=replace(newscontent," ","")
newscontent=replace(newscontent,"\n","")
newscontent=replace(newscontent,chr(10),"")
newscontent=replace(newscontent,chr(13),"")
'===============get content=======================
response.write(newscontent)
keyid=substr(newscontent,"列号:","信息类别:")
newsclass=substr(newscontent,"类别:","所在城市:")
city=substr(newscontent,"城市:","房屋具体位置:")
position=substr(newscontent,"位置:","房屋类型:")
housetype=substr(newscontent,"类型:","楼层:")
level=substr(newscontent,"楼层:","使用面积:")
area=substr(newscontent,"面积:","房价:")
price=substr(newscontent,"房价:","其他说明:")
demostra=substr(newscontent,"说明:","联系人:")
contactman=substr(newscontent,"联系人:","联系方式:")
contact=substr(newscontent,"联系方式:","信息来源:")
response.write("总序列号:"&keyid&"<br>")
response.write("信息类别:"&newsclass&"<br>")
response.write("所在城市:"&city&"<br>")
response.write("房屋具体位置:"&position&"<br>")
response.write("房屋类型:"&housetype&"<br>")
response.write("楼层:"&level&"<br>")
response.write("使用面积:"&area&"<br>")
response.write("房价:"&price&"<br>")
response.write("其他说明:"&demostra&"<br>")
response.write("联系人:"&contactman&"<br>")
response.write("联系方式:"&contact&"<br>")
'title=removehtml(aa(i))
'response.write("title:"&title)
for n=0 to application.contents.count
if(application.contents(n)=keyid) then
ifexit=true
end if
next
if not ifexit then
application(time&i)=keyid
'添加到
'====================================================
set rs=server.createobject("adodb.recordset")
rs.open "select top 1 * from news order by id desc",conn,3,3
rs.addnew
rs("newsclass")=newsclass
rs("city")=city
rs("position")=position
rs("housetype")=housetype
rs("level")=level
rs("area")=area
rs("price")=price
rs("demostra")=demostra
rs("contactman")=contactman
rs("contact")=contact
rs.update
rs.close
set rs=nothing
end if
'==================================================
next
function removetag(body)
set regex = new regexp
regex.pattern = "<[a].*?<\/[a]>"
regex.ignorecase = true
regex.global = true
set matches = regex.execute(body)
dim i,arr(15),ifexit
i=0
j=0
for each match in matches
tempstr = match.value
tempstr=replace(tempstr,"<td>","")
tempstr=replace(tempstr,"</td>","")
tempstr=replace(tempstr,"<tr>","")
tempstr=replace(tempstr,"</tr>","")
arr(i)=tempstr
i=i+1
if(i>=15) then
exit for
end if
next
set regex=nothing
set matches =nothing
removetag=arr
end function
function regexhtml(body)
dim r_arr(47),r_temp
set regex2 = new regexp
regex2.pattern ="<a.*?<\/a>"
regex2.ignorecase = true
regex2.global = true
set matches2 = regex2.execute(body)
iii=0
for each match in matches2
r_arr(iii)=match.value
iii=iii+1
next
regexhtml=r_arr
set regex2=nothing
set matches2=nothing
end function
'======================================================
conn.close
set conn=nothing
%>
</body>
</html>
function.asp
<%
'**************************************************
'函数名:gottopic
'作 用:截字符串,汉字一个算两个字符,英文算一个字符
'参 数:str ----原字符串
' strlen ----截取长度
'返回值:截取后的字符串
'**************************************************
function gottopic(str,strlen)
if str="" then
gottopic=""
exit function
end if
dim l,t,c, i
str=replace(replace(replace(replace(str," "," "),""",chr(34)),">",">"),"<","<")
str=replace(str,"?","")
l=len(str)
t=0
for i=1 to l
c=abs(asc(mid(str,i,1)))
if c>255 then
t=t+2
else
t=t+1
end if
if t>=strlen then
gottopic=left(str,i) & "…"
exit for
else
gottopic=str
end if
next
gottopic=replace(replace(replace(replace(gottopic," "," "),chr(34),"""),">",">"),"<","<")
end function
'=========================================================
'函数:removehtml(strhtml)
'功能:去除html标记
'参数:strhtml --要去除html标记的字符串
'=========================================================
function removehtml(strhtml)
dim objregexp, match, matches
set objregexp = new regexp
objregexp.ignorecase = true
objregexp.global = true
'取闭合的<>
objregexp.pattern = "<.+?>"
'进行匹配
set matches = objregexp.execute(strhtml)
' 遍历匹配集合,并替换掉匹配的项目
for each match in matches
strhtml=replace(strhtml,match.value,"")
next
removehtml=strhtml
set objregexp = nothing
set matches=nothing
end function
%>
conn.asp
<%
'on error resume next
set conn=server.createobject("adodb.connection")
con= "driver={microsoft access driver (*.mdb)};dbq=" & server.mappath("stest.mdb")
conn.open con
sub connclose
conn.close
set conn=nothing
end sub
%>
附:抓取信息的详细页面事例
总序列号: |
479280 |
信息类别:
|
出租 |
所在城市:
|
济南 |
房屋具体位置:
|
华龙路华信路交界口 |
房屋类型:
|
其他 |
楼层:
|
六层 |
使用面积:
|
24~240 平方米之间 |
房价:
|
0 [租赁:元/月,买卖:万元/套] |
其他说明:
|
华信商务楼3至6层小空间对外出租(0.5元/平起),本楼属纯商务办公投资使用,可用于办公写字间,周边设施齐全、交通便利(37、80、k95在本楼前经过),全产权、市证,楼内设施包括水、电、暖、电梯设施齐全,有意者可电讯! |
联系人:
|
鲁、王 |
联系方式:
|
88017966、86812217 |
信息来源:
|
2005-8-4 8:28:55 来自:218.98.86.175 |
点击次数:
|
19 |