采集网页图片代码
采集网页上的图片,关键在于如何解析出页面代码中 img 标签的 src 属性。网上的做法大多是通过字符串操作找出 img 标签,这种方式操作起来比较麻烦,代码读起来也比较累。这里我用的方法是先通过 WebBrowser 加载页面,再用 HtmlDocument 类来操作,省去了字符串处理的步骤:直接调用 GetElementsByTagName,把所有 img 元素返回到一个 HtmlElementCollection 对象里,然后逐个读取它们的 src 属性。
代码如下:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
namespace windowsformsapplication1
{
/// <summary>
/// Downloads every image referenced by the &lt;img&gt; elements of a web page
/// into a local directory. The page is loaded in a <see cref="WebBrowser"/>
/// control and the image URLs are read from its parsed document, so no manual
/// HTML string parsing is needed.
/// </summary>
/// <remarks>
/// The public member names are kept exactly as callers already use them.
/// </remarks>
public class gatherpic
{
    // Extensions that are accepted as-is when building the local file name.
    private static readonly string[] KnownImageExtensions =
        new string[] { ".jpg", ".jpeg", ".png", ".gif", ".bmp" };

    // Extension appended when the URL does not end in a known image extension.
    private const string DefaultExtension = ".jpg";

    // Name used when the URL path ends with '/' and therefore has no file name.
    private const string FallbackFileName = "image";

    private const int BufferSize = 1024;
    private const int RequestTimeoutMilliseconds = 10000;

    private readonly string savepath;
    private readonly string geturl;
    private WebBrowser wb;

    // Number of <img> elements found by the most recent searchimglist() call.
    private int iimgcount;

    /// <summary>Stores the page address and the target directory.</summary>
    /// <param name="sweburl">Address of the page whose images are collected.</param>
    /// <param name="ssavepath">Directory the images are written to.</param>
    public gatherpic(string sweburl, string ssavepath)
    {
        this.geturl = sweburl;
        this.savepath = ssavepath;
    }

    /// <summary>
    /// Starts loading the page. The images are saved later, when the browser
    /// raises DocumentCompleted for the top-level document.
    /// </summary>
    /// <returns>
    /// <c>false</c> (after showing a message box) when no address was given;
    /// otherwise <c>true</c> once navigation has been started.
    /// </returns>
    public bool start()
    {
        if (geturl == null || geturl.Trim().Length == 0)
        {
            MessageBox.Show("哪来的虾米连网址都没输!");
            return false;
        }

        // Release the browser of a previous run so repeated calls do not leak controls.
        if (wb != null)
        {
            wb.DocumentCompleted -= documentcompleted;
            wb.Dispose();
        }

        wb = new WebBrowser();
        // Subscribe before navigating so the completion event cannot be missed.
        wb.DocumentCompleted += documentcompleted;
        wb.Navigate(geturl);
        return true;
    }

    /// <summary>Handles <see cref="WebBrowser.DocumentCompleted"/>.</summary>
    private void documentcompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        // The event also fires for every frame/iframe; only the event whose URL
        // matches the document URL triggers the image scan.
        if (wb.Document == null || e.Url != wb.Document.Url)
        {
            return;
        }

        searchimglist();
    }

    /// <summary>
    /// Finds all &lt;img&gt; elements of the loaded document and saves each
    /// referenced image into the configured directory. Does nothing when no
    /// document has been loaded yet.
    /// </summary>
    public void searchimglist()
    {
        if (wb == null || wb.Document == null)
        {
            return;
        }

        HtmlElementCollection images = wb.Document.GetElementsByTagName("img");
        iimgcount = images.Count;

        foreach (HtmlElement image in images)
        {
            saveimagefromweb(image.GetAttribute("src"), savepath);
        }
    }

    /// <summary>Downloads one remote image into a directory.</summary>
    /// <param name="imgurl">Absolute http/https address of the image.</param>
    /// <param name="path">Existing directory the file is written to.</param>
    /// <returns>
    /// 1 when the image was saved; 0 when the address is missing or not an
    /// absolute http/https URL, the response is not an image, or the download
    /// or the file write failed.
    /// </returns>
    public int saveimagefromweb(string imgurl, string path)
    {
        if (string.IsNullOrEmpty(imgurl) || string.IsNullOrEmpty(path))
        {
            return 0;
        }

        Uri imageUri;
        if (!Uri.TryCreate(imgurl, UriKind.Absolute, out imageUri))
        {
            return 0;
        }

        // Only http(s) requests can be cast to HttpWebRequest below.
        if (imageUri.Scheme != Uri.UriSchemeHttp && imageUri.Scheme != Uri.UriSchemeHttps)
        {
            return 0;
        }

        string targetPath = Path.Combine(path, BuildFileName(imageUri));

        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(imageUri);
            request.UserAgent = "mozilla/6.0 (msie 6.0; windows nt 5.1; natas.robot)";
            request.Timeout = RequestTimeoutMilliseconds;

            using (WebResponse response = request.GetResponse())
            {
                string contentType = response.ContentType;
                if (contentType == null
                    || !contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
                {
                    return 0;
                }

                using (Stream source = response.GetResponseStream())
                using (FileStream target = new FileStream(targetPath, FileMode.Create))
                {
                    // Copy until the stream ends instead of trusting ContentLength,
                    // which is -1 when the server does not send the header.
                    byte[] buffer = new byte[BufferSize];
                    int read;
                    while ((read = source.Read(buffer, 0, buffer.Length)) > 0)
                    {
                        target.Write(buffer, 0, read);
                    }
                }

                return 1;
            }
        }
        catch (WebException)
        {
            return 0;
        }
        catch (IOException)
        {
            // Covers a missing target directory and read/write failures.
            return 0;
        }
        catch (UnauthorizedAccessException)
        {
            return 0;
        }
    }

    // Derives a file name that is valid on disk from the last segment of the URL path.
    private static string BuildFileName(Uri imageUri)
    {
        // AbsolutePath excludes the query string and fragment.
        string urlPath = imageUri.AbsolutePath;
        string name = Uri.UnescapeDataString(urlPath.Substring(urlPath.LastIndexOf('/') + 1));

        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            name = name.Replace(invalid, '_');
        }

        if (name.Length == 0)
        {
            name = FallbackFileName;
        }

        string extension = Path.GetExtension(name);
        foreach (string known in KnownImageExtensions)
        {
            if (string.Equals(extension, known, StringComparison.OrdinalIgnoreCase))
            {
                return name;
            }
        }

        // Unknown or missing extension: fall back to the default image type.
        return name + DefaultExtension;
    }
}
}
//----------------- Usage --------------------
// Make sure the directory c:\test exists before running.
// start() only begins loading the page; the images are saved afterwards, when the
// browser raises DocumentCompleted.
// NOTE(review): the WebBrowser control needs an STA thread with a running Windows
// Forms message loop (e.g. call this from a Form) - confirm against the host application.
gatherpic gatherpic = new gatherpic("https://www.baidu.com", @"c:\test");
gatherpic.start();
摘自 与时俱进
推荐阅读