欢迎您访问程序员文章站,本站旨在为大家提供并分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

采集网页图片代码

程序员文章站 2022-03-30 21:16:57
采集网页上图片的关键在于如何解析出页面代码里那些 img 标签的 src 属性。在网上找了一下,大多都是通过字符串操作找出 img 标签,这种方式操作起来比较麻烦,而且代码读起来也比较累。这里我...

采集网页上图片的关键在于如何解析出页面代码里那些 img 标签的 src 属性。在网上找了一下,大多都是通过字符串操作找出 img 标签,这种方式操作起来比较麻烦,而且代码读起来也比较累。这里我用的方法是通过 WebBrowser 控件来加载页面,然后用 HtmlDocument 类来操作,省去了字符串操作的步骤:直接调用 GetElementsByTagName("img"),把所有 img 元素返回到一个 HtmlElementCollection 对象里,再逐个读取它们的 src 属性。
代码如下:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
namespace windowsformsapplication1
{
    public class gatherpic
    {
        private string savepath;
        private string geturl;
        private webbrowser wb;
        private int iimgcount;
        //初始化参数
        public gatherpic(string sweburl, string ssavepath)
        {
            this.geturl = sweburl;
            this.savepath = ssavepath;
        }
        //开始采集
        public bool start()
        {
            if (geturl.trim().equals(""))
            {
                messagebox.show("哪来的虾米连网址都没输!");
                return false;
            }
            this.wb = new webbrowser();
            this.wb.navigate(geturl);
            //委托事件
            this.wb.documentcompleted += new system.windows.forms.webbrowserdocumentcompletedeventhandler(documentcompleted);
            return true;
        }
        //webbrowser.documentcompleted委托事件
        private void documentcompleted(object sender, webbrowserdocumentcompletedeventargs e)
        {
            //页面里框架iframe加载完成不掉用searchimglist()
            if (e.url != wb.document.url) return;
            searchimglist();
        }
        //检查出所有图片并采集到本地
        public void searchimglist()
        {
            string simgurl;
            //取得所有图片地址
            htmlelementcollection elemcoll = this.wb.document.getelementsbytagname("img");
            this.iimgcount = elemcoll.count;
            foreach (htmlelement elem in elemcoll)
            {
                simgurl = elem.getattribute("src");
                //调用保存远程图片函数
                saveimagefromweb(simgurl, this.savepath);
            }
        }
        //保存远程图片函数
        public int saveimagefromweb(string imgurl, string path)
        {
            string imgname = imgurl.tostring().substring(imgurl.tostring().lastindexof("/") + 1);
            path = path + "\\" + imgname;
            string defaulttype = ".jpg";
            string[] imgtypes = new string[] { ".jpg", ".jpeg", ".png", ".gif", ".bmp" };
            string imgtype = imgurl.tostring().substring(imgurl.tostring().lastindexof("."));
            foreach (string it in imgtypes)
            {
                if (imgtype.tolower().equals(it))
                    break;
                if (it.equals(".bmp"))
                    imgtype = defaulttype;
            }
            try
            {
                httpwebrequest request = (httpwebrequest)webrequest.create(imgurl);
                request.useragent = "mozilla/6.0 (msie 6.0; windows nt 5.1; natas.robot)";
                request.timeout = 10000;
                webresponse response = request.getresponse();
                stream stream = response.getresponsestream();
                if (response.contenttype.tolower().startswith("image/"))
                {
                    byte[] arraybyte = new byte[1024];
                    int imglong = (int)response.contentlength;
                    int l = 0;
                    // createdirectory(path);
                    filestream fso = new filestream(path, filemode.create);
                    while (l < imglong)
                    {
                        int i = stream.read(arraybyte, 0, 1024);
                        fso.write(arraybyte, 0, i);
                        l += i;
                    }
                    fso.close();
                    stream.close();
                    response.close();
                    return 1;
                }
                else
                {
                    return 0;
                }
            }
            catch (webexception)
            {
                return 0;
            }
            catch (uriformatexception)
            {
                return 0;
            }
        }
    }
}
//----------------- Calling code --------------------
// NOTE(review): the class drives a WinForms WebBrowser, so this presumably has to
// run on the UI (STA) thread of an application with a message loop — confirm.
gatherpic gatherer = new gatherpic("https://www.baidu.com", @"c:\test");
// Make sure the test directory exists under c:\
gatherer.start();

 

 

摘自 与时俱进