Java crawler framework WebCollector-2.7.3: a demo for crawling images from web pages
WebCollector framework GitHub address: https://github.com/CrawlScript/WebCollector
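Before the image demo, here is a minimal sketch of the framework's basic usage pattern (the class name HelloCrawler and the seed URL are made up for illustration): extend BreadthCrawler, implement visit(), add a seed, and call start().

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

public class HelloCrawler extends BreadthCrawler {

    public HelloCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        this.addSeed("https://www.example.com"); // illustrative seed URL
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        // Called once for every successfully fetched page
        System.out.println("visited: " + page.url());
    }

    public static void main(String[] args) throws Exception {
        new HelloCrawler("hello_crawl", true).start(2); // crawl to a depth of 2
    }
}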
Demo source code
package com.collector;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import okhttp3.Request;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Crawls pictures from web pages.
 *
 * @author he
 */
public class AutoPicCrawler extends BreadthCrawler {

    File downloadDir;
    private final static String downPath = "E:/toolSource/picFile";
    private final static String seed = "https://www.csdn.net";

    public static class MyRequester extends OkHttpRequester {

        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";
        String cookie = "JSESSIONID=asdasdasdasdasdasdasdsadsa";

        // Called before every request is sent; attach custom headers here
        @Override
        public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
            return super.createRequestBuilder(crawlDatum)
                    .addHeader("User-Agent", userAgent)
                    .addHeader("Cookie", cookie);
        }
    }

    public AutoPicCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        // Create the download directory if it does not exist yet
        downloadDir = new File(downPath);
        if (!downloadDir.exists()) {
            downloadDir.mkdirs();
        }
        // Use the customized requester so every request carries the headers above
        this.setRequester(new MyRequester());
        // Seed URL the crawl starts from
        this.addSeed(seed);
        setThreads(50);
        getConf().setTopN(100);
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        // Follow redirects by re-queuing the target URL with the same meta data
        if (page.code() == 301 || page.code() == 302) {
            next.addAndReturn(page.location()).meta(page.meta());
            return;
        }
        String url = page.url();
        System.out.println("url:" + url);
        String contentType = page.contentType();
        System.out.println("contentType:" + contentType);
        if (contentType == null) {
            return;
        } else if (contentType.contains("html")) {
            // HTML page: extract the image URLs it contains and queue them as follow-up tasks
            Elements imgs = page.select("img[src]");
            for (Element img : imgs) {
                String imgSrc = img.attr("abs:src");
                System.out.println("imgSrc:" + imgSrc);
                next.add(imgSrc);
            }
        } else if (contentType.startsWith("image")) {
            // Image response: write the response body straight to disk
            String extensionName = contentType.split("/")[1];
            String imageFileName = getTimeCodeName() + "." + extensionName;
            File imageFile = new File(downloadDir, imageFileName);
            try {
                FileUtils.write(imageFile, page.content());
                System.out.println("Saved image " + page.url() + " to " + imageFile.getAbsolutePath());
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        AutoPicCrawler crawler = new AutoPicCrawler("crawl", true);
        // start crawling with a depth of 4
        crawler.start(4);
    }

    /**
     * @return a file name built from the current system time plus a random code
     */
    public static String getTimeCodeName() {
        Date now = new Date();
        DateFormat sdf = new SimpleDateFormat("yyMMddHHmmssSSS");
        String timestamp = sdf.format(now);
        // Append a random three-digit code to reduce the chance of name collisions
        int code = (int) ((Math.random() * 9 + 1) * 100);
        return timestamp + code;
    }
}
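Note on crawl scope: because autoParse is true, BreadthCrawler also queues links it parses from each page on its own, so with a large seed such as www.csdn.net the task queue grows quickly. A minimal sketch, assuming the 2.7.x addRegex API (the regex patterns are only illustrative), of how the constructor above could narrow the auto-parsed links:

    public AutoPicCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        // ... same setup as in the demo above ...

        // Positive rule: only auto-follow pages under the seed site
        this.addRegex("https://www\\.csdn\\.net/.*");
        // Negative rule (leading "-"): skip script and stylesheet URLs
        this.addRegex("-.*\\.(js|css)(\\?.*)?$");
        // Depending on the version, URLs added manually via next.add(...) may bypass these rules
    }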
Execution result: the console prints each visited URL, its content type and the extracted image URLs, and the downloaded images are saved under E:/toolSource/picFile.