C#通过正则表达式实现提取网页中的图片

程序员文章站 2022-06-25 08:57:37

目前在做项目中有处理图片的部分，参考了一下网上案例，自己写了一个获取内容中的图片地址的方法。一般来说一个 html 文档有很多标签，比如“<html>”、“<body>”、“<table>”等……

目前在做项目中有处理图片的部分，参考了一下网上案例，自己写了一个获取内容中的图片地址的方法。

一般来说一个 html 文档有很多标签，比如“<html>”、“<body>”、“<table>”等，想把文档中的 img 标签提取出来并不是一件容易的事。由于 img 标签的写法变化多端，用程序查找时很容易遗漏。因此必须写一个足够健全的正则表达式，否则可能找得不全，或者找出来的不是正确的 img 标签。

我们可以从 html 标签的格式去想应该怎么建这个正则表达式。首先要想一下 img 标签有几种写法，忽略大小写不看的话，下面列出 img 标签可能出现的几种情况。
<img> <img/> <img src=/>

这一些标签不用考虑，因为没有图片资源地址。
<img src = /images/pic.jpg/ > <img src =" /images/pic.jpg" > <img src= '/images/pic.jpg ' / >

这些标签都有图片资源地址，另外还有一个特点就是有引号对，可能为单引号，也可能为双引号。因为不需要同时匹配引号对，所以正则表达式可以这么写：@"<img\s*src\s*=\s*[""']?\s*(?<imgurl>[^\s""'<>]*)\s*/?\s*>"
<img width="320" height="240" src=/images/pic.jpg onclick="window.open('/images/pic.jpg')">

因为 img 和 src 之间可能会有其他的参数，所以“<img”要有个单词结束，比如说不能是“<imgabc”，同样 src 前面也是一样，使用单词结束符“\b”有一个好处就是省去了表示空格的“\s*”。另外由于 img 标签中不可以出现“<”、“>”这样的符号，所以要改写前面的正则表达式：@"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgurl>[^\s""'<>]*)[^<>]*?/?\s*>"
<img width="320" height="240" src = "
/images/pic.jpg" />

像这种用回车符折行的情况有时候会出现，所以在有空格分开的地方要包含回车换行和 tab 字符，另外在图片地址中不能出现空格、tab、回车和换行字符。（实际上 .NET 正则表达式中的“\s”已经包含空格、tab、回车和换行，下面把它们显式写出来只是为了强调。）

所以上面的正则表达式可以改成：@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgurl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>"

下面写出取得html中所有图片地址的类hvthtmlimage：

using System.Text.RegularExpressions;
namespace hovertree.hovertreeframe.hvtimage
{
    /// <summary>
    /// Extracts image URLs from the <c>img</c> tags of an HTML fragment.
    /// </summary>
    public class hvthtmlimage
    {
        // Built once and shared by every call instead of being re-parsed per call.
        // The pattern tolerates extra attributes between "img" and "src", optional
        // single or double quotes, and whitespace or line breaks around "=".
        private static readonly Regex s_imgTagRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgurl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Returns the <c>src</c> value of every <c>img</c> tag found in the given HTML.
        /// </summary>
        /// <param name="shtmltext">HTML text to scan.</param>
        /// <returns>
        /// The captured URLs in document order, one entry per matched tag
        /// (duplicates are kept). An empty array when the input is null or empty.
        /// </returns>
        public static string[] gethvtimgurls(string shtmltext)
        {
            // Regex.Matches throws on null; treat "nothing to scan" as "no images".
            if (string.IsNullOrEmpty(shtmltext))
            {
                return new string[0];
            }

            MatchCollection matches = s_imgTagRegex.Matches(shtmltext);
            string[] urls = new string[matches.Count];
            for (int i = 0; i < matches.Count; i++)
            {
                urls[i] = matches[i].Groups["imgurl"].Value;
            }
            return urls;
        }
    }
}

下面我们再来看一个例子

/// <summary>
/// Collects the image URLs referenced by <c>src="..."</c> and <c>href="..."</c>
/// attributes whose value ends in .jpg, .png, .gif, .bmp or .ico.
/// </summary>
/// <param name="html">HTML text to scan; null or empty yields an empty result.</param>
/// <param name="com">
/// Site root (scheme and host, e.g. "http://www.example.com") used to turn
/// relative URLs into absolute ones.
/// </param>
/// <returns>
/// An array of distinct URL strings: all <c>src</c> matches first, then all
/// <c>href</c> matches, each group in document order.
/// </returns>
public Array matchhtml(string html, string com)
{
    List<string> urls = new List<string>();
    if (string.IsNullOrEmpty(html))
    {
        return urls.ToArray();
    }

    HashSet<string> seen = new HashSet<string>();
    foreach (string attribute in new string[] { "src", "href" })
    {
        // The extension list must be an alternation. The previous pattern used a
        // character class "[(.jpg)(.png)...]", which accepted any value ending in
        // one of those letters (for example "/page.asp").
        // IgnoreCase replaces lowercasing the whole document, which corrupted
        // URLs whose path is case-sensitive.
        Regex regexattr = new Regex(
            attribute + "=\"(?<url>[^\"]*\\.(?:jpg|png|gif|bmp|ico))\"",
            RegexOptions.IgnoreCase);

        foreach (Match m in regexattr.Matches(html))
        {
            // Read the capture group instead of stripping "src=" / quotes with
            // Replace, which also removed those substrings from inside the URL.
            string url = ResolveImageUrl(m.Groups["url"].Value, com);
            if (seen.Add(url))
            {
                urls.Add(url);
            }
        }
    }
    return urls.ToArray();
}

// Makes a URL taken from the page absolute relative to the site root.
private static string ResolveImageUrl(string url, string com)
{
    // Only a leading scheme marks an absolute URL; a plain "contains http" test
    // misclassifies relative paths such as "/img/http-logo.png".
    if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        return url;
    }

    // Uri resolution handles "img/a.jpg", "/img/a.jpg" and "//host/a.jpg" alike.
    Uri baseuri;
    Uri absolute;
    if (Uri.TryCreate(com, UriKind.Absolute, out baseuri)
        && Uri.TryCreate(baseuri, url, out absolute))
    {
        return absolute.AbsoluteUri;
    }

    // No usable site root: fall back to plain concatenation.
    return com + url;
}

// kernel32 export names are case-sensitive, so each declaration names its entry
// point explicitly; the lowercase C# method names alone would not resolve.
[DllImport("kernel32.dll", EntryPoint = "SetConsoleMode", SetLastError = true)]
static extern bool setconsolemode(IntPtr hconsolehandle, int mode);

[DllImport("kernel32.dll", EntryPoint = "GetConsoleMode", SetLastError = true)]
static extern bool getconsolemode(IntPtr hconsolehandle, out int mode);

[DllImport("kernel32.dll", EntryPoint = "GetStdHandle", SetLastError = true)]
static extern IntPtr getstdhandle(int handle);

// Win32 STD_INPUT_HANDLE: selects the console input buffer.
const int std_input_handle = -10;

// ENABLE_QUICK_EDIT_MODE (0x40) combined with ENABLE_EXTENDED_FLAGS (0x80);
// the Win32 documentation requires the extended flag for quick-edit to apply.
const int enable_quick_edit_mode = 0x40 | 0x80;

/// <summary>
/// Turns on quick-edit mode (mouse selection / paste) for the console input
/// buffer of the current process. Does nothing when the current input mode
/// cannot be read, e.g. when standard input is not a console.
/// </summary>
public static void enablequickeditmode()
{
    IntPtr handle = getstdhandle(std_input_handle);

    int mode;
    if (!getconsolemode(handle, out mode))
    {
        // Without a valid current mode, OR-ing flags in and writing them back
        // would overwrite the console's real settings with garbage.
        return;
    }

    mode |= enable_quick_edit_mode;
    setconsolemode(handle, mode);
}
    /// <summary>
    /// Interactive console loop: asks for a page URL and a save directory,
    /// downloads the page, extracts its image URLs via matchhtml and saves each
    /// supported image under a sequential numeric file name. The running number
    /// is persisted in a counter file so names do not repeat across runs.
    /// The loop ends when standard input is closed.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        enablequickeditmode();
        Console.Title = "takeimagefrominternet";
        string path = "e:\\download\\loading\\";
        // Text file holding the running counter, so saved file names never repeat.
        string countfile = "e:\\countfile.txt";

        using (WebClient client = new WebClient())
        {
            while (true)
            {
                int cursor = 0;
                if (File.Exists(countfile))
                {
                    // TryParse leaves cursor at 0 when the file does not hold a number.
                    // Switch to long if the counter may exceed the int range.
                    int.TryParse(File.ReadAllText(countfile), out cursor);
                }
                // Reset on every pass so a stale value cannot skew the reported count.
                int oldcount = cursor;

                Console.Write("please input a url:");
                string url = "http://www.baidu.com/";
                string temp = Console.ReadLine();
                if (temp == null)
                {
                    // Standard input was closed; looping again would spin forever.
                    break;
                }
                if (!string.IsNullOrEmpty(temp))
                {
                    url = temp;
                }

                // Scheme + host of the page, used to resolve relative image URLs.
                // Uri parsing also covers https and any top-level domain.
                string com = "";
                Uri pageuri;
                if (Uri.TryCreate(url, UriKind.Absolute, out pageuri))
                {
                    com = pageuri.GetLeftPart(UriPartial.Authority);
                }

                Console.Write("please input a save path:");
                temp = Console.ReadLine();
                if (Directory.Exists(temp))
                {
                    path = temp;
                }

                // Clear only after the inputs were read, so the previous run's
                // results stay on screen until the user starts the next one.
                Console.Clear();
                Console.WriteLine();

                string html;
                try
                {
                    byte[] htmldata = client.DownloadData(url);
                    using (MemoryStream mstream = new MemoryStream(htmldata))
                    using (StreamReader sr = new StreamReader(mstream))
                    {
                        html = sr.ReadToEnd();
                    }
                }
                catch (Exception ex)
                {
                    // A bad URL or network error must not end the interactive loop.
                    Console.WriteLine(ex.Message);
                    continue;
                }

                Array urls = new matchhtmlimageurl().matchhtml(html, com);

                bool cansave = Directory.Exists(path);
                if (!cansave)
                {
                    Console.WriteLine("save path not found: " + path);
                }

                foreach (string imageurl in urls)
                {
                    Console.WriteLine(imageurl);
                    // Advance the counter only for images actually written, so the
                    // reported count and the file numbering stay accurate.
                    if (cansave && TrySaveImage(client, imageurl, path, cursor))
                    {
                        cursor++;
                    }
                }

                File.WriteAllText(countfile, cursor.ToString(), Encoding.UTF8);
                Console.WriteLine("take done...image count:" + (cursor - oldcount).ToString());
            }
        }
    }

    // Downloads one image and saves it as "<index><ext>" in the given directory.
    // Returns true only when the file was written.
    private static bool TrySaveImage(WebClient client, string imageurl, string path, int index)
    {
        Uri imageuri;
        if (!Uri.TryCreate(imageurl, UriKind.Absolute, out imageuri))
        {
            return false;
        }

        // NOTE(review): the original called a project helper
        // (aping.utility.file.fileopration.extendname) here; its switch cases
        // (".jpg", ".png", ...) show it returned the extension with a leading
        // dot, which Path.GetExtension also does. Confirm nothing else was relied on.
        string ext = Path.GetExtension(imageuri.AbsolutePath).ToLowerInvariant();
        ImageFormat format = GetImageFormat(ext);
        if (format == null)
        {
            // Unsupported type: skip before spending a download on it.
            return false;
        }

        try
        {
            byte[] imagedata = client.DownloadData(imageuri);
            if (imagedata == null || imagedata.Length == 0)
            {
                return false;
            }

            using (MemoryStream ms = new MemoryStream(imagedata))
            using (Image image = new Bitmap(ms))
            {
                image.Save(Path.Combine(path, index + ext), format);
            }
            return true;
        }
        catch (Exception ex)
        {
            // Best effort: report the failure and carry on with the next image.
            Console.WriteLine(ex.Message);
            return false;
        }
    }

    // Maps a lowercase file extension (with leading dot) to its image format,
    // or null when the extension is not one of the supported types.
    private static ImageFormat GetImageFormat(string ext)
    {
        switch (ext)
        {
            case ".jpg":
                return ImageFormat.Jpeg;
            case ".bmp":
                return ImageFormat.Bmp;
            case ".png":
                return ImageFormat.Png;
            case ".gif":
                return ImageFormat.Gif;
            case ".ico":
                return ImageFormat.Icon;
            default:
                return null;
        }
    }

上一篇：详解C#打开和关闭可执行文件

下一篇：如何使用Rotativa在ASP.NET Core MVC中创建PDF详解

C#通过正则表达式实现提取网页中的图片

C#中通过使用Connection类来实现打开/关闭数据库的代码实例

C#中通过LRU实现通用高效的超时连接探测

C#提取PPT文本和图片的实现方法

C#实现通过ffmpeg从flv视频文件中截图的方法

C#实现通过ffmpeg从flv视频文件中截图的方法

C#中通过使用Connection类来实现打开/关闭数据库的代码实例

C#中通过LRU实现通用高效的超时连接探测

C#基于正则表达式实现获取网页中所有信息的网页抓取类实例

C#实现winform中RichTextBox在指定光标位置插入图片的方法

通过正则表达式获取url中参数的简单实现