欢迎您访问程序员文章站,本站旨在为大家分享程序员与计算机编程知识!
您现在的位置是: 首页

Java爬虫框架 WebCollector-2.7.3 爬取网页图片Demo

程序员文章站 2022-05-27 08:29:14
...

WebCollector框架Github地址:https://github.com/CrawlScript/WebCollector

Demo源码

package com.collector;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import okhttp3.Request;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.concurrent.ThreadLocalRandom;

/**
 * Crawling picture from web pages
 *
 * @author he
 */
/**
 * Demo crawler that walks a site breadth-first and downloads every image it
 * encounters into {@link #downPath}.
 *
 * <p>HTML responses are parsed for {@code <img src>} URLs, which are queued
 * as follow-up crawl tasks; responses with an {@code image/*} content type
 * are written straight to disk under a timestamp-based file name.
 *
 * @author he
 */
public class AutoPicCrawler extends BreadthCrawler {

    // Target directory for downloaded images; created on demand in the constructor.
    File downloadDir;

    /** Local directory the images are saved to. */
    private final static String downPath = "E:/toolSource/picFile";

    /** Seed URL the crawl starts from. */
    private final static String seed = "https://www.csdn.net";

    /**
     * Cached, thread-safe timestamp formatter (this crawler runs 50 threads;
     * the legacy SimpleDateFormat it replaces is not thread-safe).
     */
    private static final DateTimeFormatter TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyMMddHHmmssSSS");

    /**
     * Requester that decorates every outgoing request with a desktop-browser
     * User-Agent and a session cookie so the target site serves normal pages.
     */
    public static class MyRequester extends OkHttpRequester {

        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";
        String cookie = "JSESSIONID=asdasdasdasdasdasdasdsadsa";

        /** Invoked before every request is sent; attaches the custom headers. */
        @Override
        public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
            return super.createRequestBuilder(crawlDatum)
                    .addHeader("User-Agent", userAgent)
                    .addHeader("Cookie", cookie);
        }
    }

    /**
     * @param crawlPath directory WebCollector uses for its crawl state
     * @param autoParse whether links are extracted automatically by the framework
     */
    public AutoPicCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        downloadDir = new File(downPath);
        if (!downloadDir.exists()) {
            downloadDir.mkdirs();
        }
        this.setRequester(new MyRequester());
        this.addSeed(seed);
        setThreads(50);
        getConf().setTopN(100);
    }

    /**
     * Per-page callback: follows redirects, queues image URLs found in HTML,
     * and saves image responses to {@link #downloadDir}.
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        // Follow 301/302 redirects manually, carrying the task meta data over.
        if (page.code() == 301 || page.code() == 302) {
            next.addAndReturn(page.location()).meta(page.meta());
            return;
        }

        String url = page.url();
        System.out.println("url:" + url);
        String contentType = page.contentType();
        System.out.println("contentType:" + contentType);
        if (contentType == null) {
            return;
        } else if (contentType.contains("html")) {
            // HTML page: queue every absolute image URL as a follow-up task.
            Elements imgs = page.select("img[src]");
            for (Element img : imgs) {
                String imgSrc = img.attr("abs:src");
                System.out.println("imgSrc:" + imgSrc);
                next.add(imgSrc);
            }
        } else if (contentType.startsWith("image")) {
            // Image response: write the raw bytes to disk.
            // FIX: derive the extension from the MIME subtype only — the old
            // split("/")[1] kept parameters ("; charset=...") in the filename
            // and threw ArrayIndexOutOfBoundsException on a bare "image" type.
            String extensionName = extensionOf(contentType);
            String imageFileName = getTimeCodeName() + "." + extensionName;
            File imageFile = new File(downloadDir, imageFileName);
            try {
                FileUtils.write(imageFile, page.content());
                System.out.println("保存图片 " + page.url() + " 到 " + imageFile.getAbsolutePath());
            } catch (IOException ex) {
                throw new RuntimeException("failed to save image from " + page.url(), ex);
            }
        }
    }

    /**
     * Extracts a file extension from a MIME content type, e.g.
     * {@code "image/jpeg; charset=UTF-8"} -&gt; {@code "jpeg"}.
     *
     * @param contentType non-null content type header value
     * @return the MIME subtype without parameters, or {@code "img"} when the
     *         value has no subtype (e.g. a bare {@code "image"})
     */
    private static String extensionOf(String contentType) {
        int slash = contentType.indexOf('/');
        if (slash < 0 || slash == contentType.length() - 1) {
            return "img";
        }
        String subtype = contentType.substring(slash + 1);
        int semi = subtype.indexOf(';');
        if (semi >= 0) {
            subtype = subtype.substring(0, semi);
        }
        return subtype.trim();
    }

    public static void main(String[] args) throws Exception {
        AutoPicCrawler crawler = new AutoPicCrawler("crawl", true);
        // Start crawling with a maximum depth of 4 from the seed.
        crawler.start(4);
    }

    /**
     * Builds a file-name stem from the current time plus a random three-digit
     * suffix (100-999, matching the original {@code (int)((Math.random()*9+1)*100)}
     * range) to reduce collisions when several images arrive in the same
     * millisecond.
     *
     * @return a name such as {@code 240527082914123456}
     */
    public static String getTimeCodeName() {
        String stamp = LocalDateTime.now().format(TIME_FORMAT);
        int code = ThreadLocalRandom.current().nextInt(100, 1000);
        return stamp + code;
    }

}

执行效果

Java爬虫框架 WebCollector-2.7.3 爬取网页图片Demo