写了段批量抓取某个列表页的东东

程序员文章站 2023-01-29 13:06:51

有些人当抓取程序是个宝，到目前还tnd有人在卖钱，强烈bs一下这些家伙真是的！可能偶下边这段东西比较烂哈下边这个没有写入库功能，已经到这一步了，入库功能是很简单的事了，...

有些人当抓取程序是个宝，到目前还tnd有人在卖钱，强烈bs一下这些家伙真是的！可能偶下边这段东西比较烂哈
下边这个没有写入库功能，已经到这一步了，入库功能是很简单的事了，需要的请自己去完成吧，其它功能各位自行完善吧！把代码拷贝过去直接运行即可看到效果

dim url,list_pagecode,array_articleid,i,articleid
dim content_pagecode,content_tempcode
dim content_categoryid,content_categoryname,borderid,classid,bordername,classname
dim articletitle,articleauthor,articlefrom,articlecontent

url = "http://www.webasp.net/article/class/1.htm"
list_pagecode = gethttppage(url)
list_pagecode = regexptext(list_pagecode,"打印</th></tr>","</table><table border=0 cellpadding=5",0)
list_pagecode = regexptext(list_pagecode,"<td align=left><a href='../","'><img border=0 src='../images/authortype0.gif'",1) '取得当前列表页的文章链接，以,分隔
array_articleid = split(list_pagecode,",") '创建数组，存储文章id

for i=0 to ubound(array_articleid)-1
    articleid = array_articleid(i)    '文章id
    content_pagecode = gethttppage("http://www.webasp.net/article/"&articleid)    '取得文章页的内容

    '=========取文章分类及相关id参数开始=======================
    content_tempcode = regexptext(content_pagecode,"<a href=""/article/"">技术教程</a> >> ",">> 内容</td>",0)
    content_categoryid = regexptext(content_pagecode,"<a href='../class","/'>",1)
    borderid = split(content_categoryid,",")(0)    '大类id
    classid = split(content_categoryid,",")(1)    '子类id
        '==========检查大类是否存在开始===============
        '如果不存在则入库

        '==========检查大类是否存在结束===============
    'response.write(borderid & "," & classid & "<br />")
    content_categoryname = regexptext(content_pagecode,"/'>","</a>",1)
    bordername = split(content_categoryname,",")(0)    '大类名称
    classname = split(content_categoryname,",")(1)    '子类名称
        '==========检查子类是否存在开始===============
        '如果不存在则入库

        '==========检查子类是否存在结束===============
    '=========取文章分类及相关id参数结束=======================

    '=========取文章标题及内容开始=============================
    articletitle = regexptext(content_pagecode,"<tr><td align=center bgcolor=#dee2f5><strong>","</strong></td></tr>",0)
    articleauthor = regexptext(content_pagecode,"<tr><td><span class=blue>作者：</span>","</td></tr>",0)
    articlefrom = regexptext(content_pagecode,"<tr><td><span class=blue>来源：</span>","</td></tr>",0)
    articlecontent = regexptext(content_pagecode,"<tr><td class=content style=""word-wrap: break-word"" id=zoom>","</td></tr>"&vbcrlf&"        </table>"&vbcrlf&"    </td></tr></table>",0)
    '=========取文章标题及内容结束=============================
    response.write(articletitle& "<br /><br />")
    response.flush()
next

附几个函数：

function gethttppage(url)
    if(isobjinstalled("microsoft.xmlhttp") = false)then
        response.write "<br><br>服务器不支持microsoft.xmlhttp组件"
        err.clear
        response.end
    end if
    on error resume next
    dim http
    set http=server.createobject("msxml2.xmlhttp")
    http.open "get",url,false
    http.send()
    if(http.readystate<>4)then
        exit function
    end if
    gethttppage=bytestobstr(http.responsebody,"gb2312")
    set http=nothing
    if(err.number<>0)then
        response.write "<br><br>获取文件内容出错"
        'response.end
        err.clear
    end if
end function

function bytestobstr(codebody,codeset)
    dim objstream
    set objstream = server.createobject("adodb.stream")
    objstream.type = 1
    objstream.mode =3
    objstream.open
    objstream.write codebody
    objstream.position = 0
    objstream.type = 2
    objstream.charset = codeset
    bytestobstr = objstream.readtext
    objstream.close
    set objstream = nothing
end function

'================================================
'作用：检查组件是否已经安装
'返回值：true ----已经安装
'        false ----没有安装
'================================================
function isobjinstalled(objname)
    on error resume next
    isobjinstalled = false
    err = 0
    dim testobj
    set testobj = server.createobject(objname)
    if(0 = err)then isobjinstalled = true
    set testobj = nothing
    err = 0
end function

function regexptext(strng,strstart,strend,n)
    dim regex,match,matches,retstr
    set regex = new regexp
    regex.pattern = strstart&"([\s\s]*?)"&strend
    regex.ignorecase = true
    regex.global = true
    set matches = regex.execute(strng)
    for each match in matches
        if(n=1)then
            retstr = retstr & regex.replace(match.value,"$1") & ","
        else
            retstr = retstr & regex.replace(match.value,"$1")
        end if
    next
    regexptext = retstr
    set regex=nothing
end function

上一篇：基于echarts实现实时数据传输效果

下一篇：司马辽太郎：日本作家，取自“远不及司马迁之太郎”之意