详解JAVA抓取网页的图片,JAVA利用正则表达式抓取网站图片
程序员文章站
2024-03-11 13:13:13
利用java抓取网页上的所有图片:
用两个正则表达式:
1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>...
利用java抓取网页上的所有图片:
用两个正则表达式:
1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>
2、匹配img标签中的src中http路径的正则:http:\"?(.*?)(\"|>|\\s+)
实现:
package org.swinglife.main;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches a web page, extracts the {@code <img>} tags with a regular expression,
 * pulls the {@code http:} address out of each tag and saves every image into the
 * current working directory.
 *
 * <p>Limitations that follow directly from the two patterns below: only addresses
 * starting with {@code http:} are recognised (so {@code https:} sources are skipped),
 * and because the tag pattern uses a greedy {@code .*}, several tags that share one
 * line of HTML may be captured as a single match. The second pass still finds every
 * address inside such a match.
 *
 * @author swinglife
 */
public class CatchImage {

    // Page to scan for images.
    private static final String URL = "http://www.csdn.net";
    // Character encoding used to decode the downloaded page.
    private static final String ECODING = "utf-8";
    // Regex matching an <img ...> tag.
    private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
    // Regex matching the http address inside an <img> tag; group 2 is the trailing delimiter.
    private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";

    // Compiled once: Pattern is immutable and compiling per tag is wasted work.
    private static final Pattern IMGURL_PATTERN = Pattern.compile(IMGURL_REG);
    private static final Pattern IMGSRC_PATTERN = Pattern.compile(IMGSRC_REG);

    public static void main(String[] args) throws Exception {
        CatchImage cm = new CatchImage();
        // Download the page text.
        String html = cm.getHtml(URL);
        // Collect the <img> tags.
        List<String> imgUrl = cm.getImageUrl(html);
        // Extract the src address from each tag.
        List<String> imgSrc = cm.getImageSrc(imgUrl);
        // Download the images.
        cm.download(imgSrc);
    }

    /**
     * Downloads the document at {@code url} and decodes it with {@link #ECODING}.
     *
     * @param url address of the page to fetch
     * @return the page content as text
     * @throws Exception if the address is malformed, the connection fails, or the
     *                   encoding is unsupported
     */
    private String getHtml(String url) throws Exception {
        URL uri = new URL(url);
        URLConnection connection = uri.openConnection();
        try (InputStream in = connection.getInputStream()) {
            // Gather the raw bytes first and decode once at the end: decoding each
            // chunk separately would break multi-byte characters that straddle a
            // chunk boundary, and decoding the whole buffer instead of only the
            // bytes actually read would append stale data.
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int length;
            while ((length = in.read(buf, 0, buf.length)) != -1) {
                bytes.write(buf, 0, length);
            }
            return bytes.toString(ECODING);
        }
    }

    /**
     * Finds every substring of {@code html} that matches {@link #IMGURL_REG}.
     *
     * @param html page content to scan
     * @return the matched tag strings in order of appearance; empty if none match
     */
    private List<String> getImageUrl(String html) {
        Matcher matcher = IMGURL_PATTERN.matcher(html);
        List<String> listImgUrl = new ArrayList<>();
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts the {@code http:} addresses contained in the given tag strings.
     *
     * @param listImageUrl tag strings as returned by {@link #getImageUrl(String)}
     * @return the addresses found, without the delimiter that terminated them
     */
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<>();
        for (String image : listImageUrl) {
            Matcher matcher = IMGSRC_PATTERN.matcher(image);
            while (matcher.find()) {
                // Group 2 is the terminating quote, '>' or whitespace run. Cutting at
                // its start removes the whole delimiter, whereas trimming a single
                // character would leave whitespace behind when the run is longer.
                listImgSrc.add(image.substring(matcher.start(), matcher.start(2)));
            }
        }
        return listImgSrc;
    }

    /**
     * Saves each image to a file in the current working directory, named after the
     * text following the last {@code '/'} of its address. A failure on one image is
     * reported on standard output and the remaining images are still attempted.
     *
     * @param listImgSrc image addresses to download
     */
    private void download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            String imageName = url.substring(url.lastIndexOf("/") + 1);
            try {
                URL uri = new URL(url);
                // Open the remote stream first so that no empty file is created when
                // the address cannot be reached; both streams are closed on any exit.
                try (InputStream in = uri.openStream();
                     FileOutputStream fo = new FileOutputStream(new File(imageName))) {
                    System.out.println("开始下载:" + url);
                    byte[] buf = new byte[1024];
                    int length;
                    while ((length = in.read(buf, 0, buf.length)) != -1) {
                        fo.write(buf, 0, length);
                    }
                }
                System.out.println(imageName + "下载完成");
            } catch (IOException e) {
                // Report which image failed and why, then carry on with the rest.
                System.out.println("下载失败:" + url + " (" + e + ")");
            }
        }
    }
}
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持。
上一篇: C# Base64编码
下一篇: Java中随机数的产生方式与原理详解