asp 采集实战代码
程序员文章站
2023-10-27 14:00:28
最近实在是太流行采集了,本人是不喜欢采集的,但对采集的原理我却很有兴趣进行研究,拿到了网上采集常用函数,对其进行了一番研究,并实战,结果成功,撇开效率问题,采集原理并不复杂...
最近实在是太流行采集了,本人是不喜欢采集的,但对采集的原理我却很有兴趣进行研究,拿到了网上采集常用函数,对其进行了一番研究,并实战,结果成功,撇开效率问题,采集原理并不复杂,大家可以在搜索吧输入“采集”查看其原理。下面是一个采集的例子:
<%@language="vbscript" codepage="65001"%>
<% response.codepage=65001%>
<% response.charset="utf-8" %>
<%server.scripttimeout=9999999
response.expires = 0
response.expiresabsolute = now() - 1
response.addheader "pragma","no-cache"
response.addheader "cache-control","private"
response.cachecontrol = "no-cache"
%>
<%
'声明取得目标信息的函数,通过xml组件进行实现。
function geturl(url)
set retrieval = server.createobject("msxml2.xmlhttp")
with retrieval
.open "get", url, false
.send
if .status<>200 then '判断文档是否已经解析完,以做客户端接受返回消息
exit function
end if
' 二进制转字符串
geturl = stb(.responsebody)
end with
'对取得信息进行验证,如果信息长度小于100则说明截取失败
end function
' 二进制转字符串,否则会出现乱码的!
function stb(vin)
const adtypetext = 2
dim bytesstream,stringreturn
set bytesstream = server.createobject("adodb.stream")
with bytesstream
.type = adtypetext
.open
.writetext vin
.position = 0
.charset = "gb2312"
.position = 2
stringreturn = .readtext
.close
end with
set bytesstream = nothing
stb = stringreturn
end function
function newstring(wstr,strng)
newstring=instr(lcase(wstr),lcase(strng))
if newstring<=0 then newstring=len(wstr)
end function
'声明截取的格式,从start开始截取,到over为结束
function getkey(html,start,over)
start=newstring(html,start)
over=newstring(html,over)
getkey=mid(html,start,over-start)
end function
dim softid,url,html,title
'采集百度知道
for i = 1 to 100
url="http://zhidao.baidu.com/question/10000"&i&".html"
html = geturl(url)
question = getkey(html,"<cq>","</cq>")
answer = getkey(html,"<ca>","</ca>")
response.write(question&"<br />")
response.write(answer)
response.write("采集成功")
next
'打开数据库,准备入库
'dim connstr,conn,rs,sql
'connstr="dbq="+server.mappath("db1.mdb")+";defaultdir=;driver={microsoft access driver (*.mdb)};"
'set conn=server.createobject("adodb.connection")
'conn.open connstr
'set rs=server.createobject("adodb.recordset")
'sql="select [列名] from [表名] where [列名]='"&title&"'"
'rs.open sql,conn,3,3
'if rs.eof and rs.bof then
'rs("列名")=title
'rs.update
'set rs=nothing
'end if
'set rs=nothing
%>
复制代码 代码如下:
<%@language="vbscript" codepage="65001"%>
<% response.codepage=65001%>
<% response.charset="utf-8" %>
<%server.scripttimeout=9999999
response.expires = 0
response.expiresabsolute = now() - 1
response.addheader "pragma","no-cache"
response.addheader "cache-control","private"
response.cachecontrol = "no-cache"
%>
<%
'声明取得目标信息的函数,通过xml组件进行实现。
function geturl(url)
set retrieval = server.createobject("msxml2.xmlhttp")
with retrieval
.open "get", url, false
.send
if .status<>200 then '判断文档是否已经解析完,以做客户端接受返回消息
exit function
end if
' 二进制转字符串
geturl = stb(.responsebody)
end with
'对取得信息进行验证,如果信息长度小于100则说明截取失败
end function
' 二进制转字符串,否则会出现乱码的!
function stb(vin)
const adtypetext = 2
dim bytesstream,stringreturn
set bytesstream = server.createobject("adodb.stream")
with bytesstream
.type = adtypetext
.open
.writetext vin
.position = 0
.charset = "gb2312"
.position = 2
stringreturn = .readtext
.close
end with
set bytesstream = nothing
stb = stringreturn
end function
function newstring(wstr,strng)
newstring=instr(lcase(wstr),lcase(strng))
if newstring<=0 then newstring=len(wstr)
end function
'声明截取的格式,从start开始截取,到over为结束
function getkey(html,start,over)
start=newstring(html,start)
over=newstring(html,over)
getkey=mid(html,start,over-start)
end function
dim softid,url,html,title
'采集百度知道
for i = 1 to 100
url="http://zhidao.baidu.com/question/10000"&i&".html"
html = geturl(url)
question = getkey(html,"<cq>","</cq>")
answer = getkey(html,"<ca>","</ca>")
response.write(question&"<br />")
response.write(answer)
response.write("采集成功")
next
'打开数据库,准备入库
'dim connstr,conn,rs,sql
'connstr="dbq="+server.mappath("db1.mdb")+";defaultdir=;driver={microsoft access driver (*.mdb)};"
'set conn=server.createobject("adodb.connection")
'conn.open connstr
'set rs=server.createobject("adodb.recordset")
'sql="select [列名] from [表名] where [列名]='"&title&"'"
'rs.open sql,conn,3,3
'if rs.eof and rs.bof then
'rs("列名")=title
'rs.update
'set rs=nothing
'end if
'set rs=nothing
%>