写了段批量抓取某个列表页的东东
有些人当抓取程序是个宝,到目前还tnd有人在卖钱,强烈bs一下这些家伙 真是的!可能偶下边这段东西比较烂哈
下边这个没有写入库功能,已经到这一步了,入库功能是很简单的事了,需要的请自己去完成吧,其它功能各位自行完善吧!把代码拷贝过去直接运行即可看到效果
dim url,list_pagecode,array_articleid,i,articleid
dim content_pagecode,content_tempcode
dim content_categoryid,content_categoryname,borderid,classid,bordername,classname
dim articletitle,articleauthor,articlefrom,articlecontent
url = "http://www.webasp.net/article/class/1.htm"
list_pagecode = gethttppage(url)
list_pagecode = regexptext(list_pagecode,"打印</th></tr>","</table><table border=0 cellpadding=5",0)
list_pagecode = regexptext(list_pagecode,"<td align=left><a href='../","'><img border=0 src='../images/authortype0.gif'",1) '取得当前列表页的文章链接,以,分隔
array_articleid = split(list_pagecode,",") '创建数组,存储文章id
for i=0 to ubound(array_articleid)-1
articleid = array_articleid(i) '文章id
content_pagecode = gethttppage("http://www.webasp.net/article/"&articleid) '取得文章页的内容
'=========取文章分类及相关id参数 开始=======================
content_tempcode = regexptext(content_pagecode,"<a href=""/article/"">技术教程</a> >> ",">> 内容</td>",0)
content_categoryid = regexptext(content_pagecode,"<a href='../class","/'>",1)
borderid = split(content_categoryid,",")(0) '大类id
classid = split(content_categoryid,",")(1) '子类id
'==========检查大类是否存在 开始===============
'如果不存在则入库
'==========检查大类是否存在 结束===============
'response.write(borderid & "," & classid & "<br />")
content_categoryname = regexptext(content_pagecode,"/'>","</a>",1)
bordername = split(content_categoryname,",")(0) '大类名称
classname = split(content_categoryname,",")(1) '子类名称
'==========检查子类是否存在 开始===============
'如果不存在则入库
'==========检查子类是否存在 结束===============
'=========取文章分类及相关id参数 结束=======================
'=========取文章标题及内容 开始=============================
articletitle = regexptext(content_pagecode,"<tr><td align=center bgcolor=#dee2f5><strong>","</strong></td></tr>",0)
articleauthor = regexptext(content_pagecode,"<tr><td><span class=blue>作者:</span>","</td></tr>",0)
articlefrom = regexptext(content_pagecode,"<tr><td><span class=blue>来源:</span>","</td></tr>",0)
articlecontent = regexptext(content_pagecode,"<tr><td class=content style=""word-wrap: break-word"" id=zoom>","</td></tr>"&vbcrlf&" </table>"&vbcrlf&" </td></tr></table>",0)
'=========取文章标题及内容 结束=============================
response.write(articletitle& "<br /><br />")
response.flush()
next
附几个函数:
function gethttppage(url)
if(isobjinstalled("microsoft.xmlhttp") = false)then
response.write "<br><br>服务器不支持microsoft.xmlhttp组件"
err.clear
response.end
end if
on error resume next
dim http
set http=server.createobject("msxml2.xmlhttp")
http.open "get",url,false
http.send()
if(http.readystate<>4)then
exit function
end if
gethttppage=bytestobstr(http.responsebody,"gb2312")
set http=nothing
if(err.number<>0)then
response.write "<br><br>获取文件内容出错"
'response.end
err.clear
end if
end function
function bytestobstr(codebody,codeset)
dim objstream
set objstream = server.createobject("adodb.stream")
objstream.type = 1
objstream.mode =3
objstream.open
objstream.write codebody
objstream.position = 0
objstream.type = 2
objstream.charset = codeset
bytestobstr = objstream.readtext
objstream.close
set objstream = nothing
end function
'================================================
'作 用:检查组件是否已经安装
'返回值:true ----已经安装
' false ----没有安装
'================================================
function isobjinstalled(objname)
on error resume next
isobjinstalled = false
err = 0
dim testobj
set testobj = server.createobject(objname)
if(0 = err)then isobjinstalled = true
set testobj = nothing
err = 0
end function
function regexptext(strng,strstart,strend,n)
dim regex,match,matches,retstr
set regex = new regexp
regex.pattern = strstart&"([\s\s]*?)"&strend
regex.ignorecase = true
regex.global = true
set matches = regex.execute(strng)
for each match in matches
if(n=1)then
retstr = retstr & regex.replace(match.value,"$1") & ","
else
retstr = retstr & regex.replace(match.value,"$1")
end if
next
regexptext = retstr
set regex=nothing
end function