Asp 使用 Microsoft.XMLHTTP 抓取网页内容并过滤需要的
程序员文章站
2022-04-15 10:21:20
asp 使用 microsoft.xmlhttp 抓取网页内容(没用乱码),并过滤需要的内容 示例源码: 复制代码 代码如下: <% dim xmlurl,http,...
asp 使用 microsoft.xmlhttp 抓取网页内容(没用乱码),并过滤需要的内容
示例源码:
<%
dim xmlurl,http,strhtml,strbody
xmlurl = request.querystring("u")
rem 异步读取xml源
set http = server.createobject("microsoft.xmlhttp")
http.open "post",xmlurl,false
http.setrequestheader "user-agent", "mozilla/4.0"
http.setrequestheader "connection", "keep-alive"
http.setrequestheader "content-type", "application/x-www-form-urlencoded"
http.send()
strhtml = bytestobstr(http.responsebody)
set http = nothing
rem 抓取主要内容
strbody = getbody(strhtml,"<div id=""div_newscontentc"" class=""cnt"">","</div>",0,0)
strbody =replace(strbody,"(本文首发于","")
strbody =replace(strbody,"财富动力网</a>,转载请注明出处。)","")
strbody =replace(strbody,"本文首发于,转载请注明出处。)","")
strbody =replace(strbody,"财富动力网</a>:http://www.927953.com","")
strbody =replace(strbody,"本文首发于","")
response.write regremovehref(strbody)
rem 获取对应网址响应的html
function bytestobstr(body)
dim objstream
set objstream = server.createobject("adodb.stream")
objstream.type = 1
objstream.mode =3
objstream.open
objstream.write body
objstream.position = 0
objstream.type = 2
objstream.charset = "utf-8"
'转换原来默认的utf-8编码转换成gb2312编码,否则直接用
'xmlhttp调用有中文字符的网页得到的将是乱码
bytestobstr = objstream.readtext
objstream.close
set objstream = nothing
end function
rem 使用正则表达式,抓取之内标记的内容
function getbody(constr,startstr,overstr,inclul,inclur)
if constr="$false$" or constr="" or isnull(constr)=true or startstr="" or isnull(startstr)=true or overstr="" or isnull(overstr)=true then
getbody="$false$"
exit function
end if
dim constrtemp
dim start,over
constrtemp=lcase(constr)
startstr=lcase(startstr)
overstr=lcase(overstr)
start = instrb(1, constrtemp, startstr, vbbinarycompare)
if start<=0 then
getbody="$false$"
exit function
else
if inclul=false then
start=start+lenb(startstr)
end if
end if
over=instrb(start,constrtemp,overstr,vbbinarycompare)
if over<=0 or over<=start then
getbody="$false$"
exit function
else
if inclur=true then
over=over+lenb(overstr)
end if
end if
getbody=midb(constr,start,over-start)
end function
rem 过滤a超链接
function regremovehref(htmlstr)
set ra = new regexp
ra.ignorecase = true
ra.global = true
ra.pattern = "<a[^>]+>(.+?)<\/a>"
regremovehref = replace(ra.replace(htmlstr,"$1"),"href=""http://www.927953.com""","")
end function
%>
效果图如下:
示例源码:
复制代码 代码如下:
<%
dim xmlurl,http,strhtml,strbody
xmlurl = request.querystring("u")
rem 异步读取xml源
set http = server.createobject("microsoft.xmlhttp")
http.open "post",xmlurl,false
http.setrequestheader "user-agent", "mozilla/4.0"
http.setrequestheader "connection", "keep-alive"
http.setrequestheader "content-type", "application/x-www-form-urlencoded"
http.send()
strhtml = bytestobstr(http.responsebody)
set http = nothing
rem 抓取主要内容
strbody = getbody(strhtml,"<div id=""div_newscontentc"" class=""cnt"">","</div>",0,0)
strbody =replace(strbody,"(本文首发于","")
strbody =replace(strbody,"财富动力网</a>,转载请注明出处。)","")
strbody =replace(strbody,"本文首发于,转载请注明出处。)","")
strbody =replace(strbody,"财富动力网</a>:http://www.927953.com","")
strbody =replace(strbody,"本文首发于","")
response.write regremovehref(strbody)
rem 获取对应网址响应的html
function bytestobstr(body)
dim objstream
set objstream = server.createobject("adodb.stream")
objstream.type = 1
objstream.mode =3
objstream.open
objstream.write body
objstream.position = 0
objstream.type = 2
objstream.charset = "utf-8"
'转换原来默认的utf-8编码转换成gb2312编码,否则直接用
'xmlhttp调用有中文字符的网页得到的将是乱码
bytestobstr = objstream.readtext
objstream.close
set objstream = nothing
end function
rem 使用正则表达式,抓取之内标记的内容
function getbody(constr,startstr,overstr,inclul,inclur)
if constr="$false$" or constr="" or isnull(constr)=true or startstr="" or isnull(startstr)=true or overstr="" or isnull(overstr)=true then
getbody="$false$"
exit function
end if
dim constrtemp
dim start,over
constrtemp=lcase(constr)
startstr=lcase(startstr)
overstr=lcase(overstr)
start = instrb(1, constrtemp, startstr, vbbinarycompare)
if start<=0 then
getbody="$false$"
exit function
else
if inclul=false then
start=start+lenb(startstr)
end if
end if
over=instrb(start,constrtemp,overstr,vbbinarycompare)
if over<=0 or over<=start then
getbody="$false$"
exit function
else
if inclur=true then
over=over+lenb(overstr)
end if
end if
getbody=midb(constr,start,over-start)
end function
rem 过滤a超链接
function regremovehref(htmlstr)
set ra = new regexp
ra.ignorecase = true
ra.global = true
ra.pattern = "<a[^>]+>(.+?)<\/a>"
regremovehref = replace(ra.replace(htmlstr,"$1"),"href=""http://www.927953.com""","")
end function
%>
效果图如下:
上一篇: Asp 解析 XML并分页显示源码
下一篇: ASP 数字分页效果代码