
Scraping Images with a Java Crawler

The crawler uses HttpClient (org.apache.httpcomponents:httpclient) to fetch each listing page and jsoup (org.jsoup:jsoup) to parse the HTML and pull out the image URLs.
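
Before reading the full class, it helps to see the jsoup half in isolation: jsoup turns an HTML string into a Document, and on the target site each image tag carries class="alignnone" with the real image URL stored in its data-lazy-src attribute (the lazy-loading script keeps it out of src). The snippet below is a minimal, self-contained sketch of that selection step; the inline HTML and the example.com URLs are made up for illustration.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupSelectDemo {
    public static void main(String[] args) {
        // stand-in HTML; in the real crawler this string comes back from HttpClient
        String html = "<div>"
                + "<img class=\"alignnone\" data-lazy-src=\"http://img.example.com/a/1.jpg\">"
                + "<img class=\"alignnone\" data-lazy-src=\"http://img.example.com/a/2.jpg\">"
                + "</div>";
        Document document = Jsoup.parse(html);
        // select by class, then read the lazy-load attribute that holds the real image URL
        for (Element img : document.getElementsByClass("alignnone")) {
            System.out.println(img.attr("data-lazy-src"));
        }
    }
}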

package cn.fu.spider;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;

public class HttpclientPoolDemo {
    static String file = "F://paqu"; // target directory for the downloaded images
    public static void main(String[] args) {
        // page URLs look like http://www.8888.com/xx/1/, http://www.8888.com/xx/2/, http://www.8888.com/xx/3/ ...
        String url = "http://**********.html" + "/";
        // 193 images in total, 9 per page, so round the page count up
        int pages = (int) Math.ceil(193 / 9.0);
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

        // maximum number of connections in the pool
        cm.setMaxTotal(50);
        // maximum number of connections per route (per target host)
        cm.setDefaultMaxPerRoute(10);
        List<String> list = doGet(cm, url, pages);
        for (String s : list) {
            downImage(s, s.substring(40)); // file name = tail of the image URL (site-specific offset of 40 characters)
        }
    }

    private static List<String> doGet(PoolingHttpClientConnectionManager cm, String url, int pages) {

        ArrayList<String> list = new ArrayList<>();
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();

        for (int i = 1; i <= pages; i++) {
            HttpGet httpGet = new HttpGet(url + i);
            // try-with-resources closes the response even if reading or parsing fails
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                HttpEntity entity = response.getEntity();
                String s = EntityUtils.toString(entity, "UTF-8");
                Document document = Jsoup.parse(s);
                // on the target site every gallery image carries class="alignnone"
                Elements elements = document.getElementsByClass("alignnone");
                for (Element element : elements) {
                    // lazy-loaded images keep the real URL in data-lazy-src instead of src
                    String attr = element.attr("data-lazy-src");
                    if (attr != null && !attr.isEmpty()) {
                        list.add(attr);
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return list;
    }
    public static void downImage(String imgurl, String fileName) {
        // create the target directory if it does not exist yet
        File files = new File(file);
        if (!files.exists()) {
            files.mkdirs();
        }
        InputStream is;
        FileOutputStream out;
        try {
            URL url = new URL(imgurl);
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            is = connection.getInputStream();
            // create the output file
            File fileofImg = new File(file + "/" + fileName);
            out = new FileOutputStream(fileofImg);
            // copy the image one byte at a time (a buffered variant is sketched after the class)
            int i = 0;
            while ((i = is.read()) != -1) {
                out.write(i);
            }
            is.close();
            out.close();
        } catch (IOException e) {
            // MalformedURLException and FileNotFoundException are both IOExceptions,
            // so a single catch block covers every failure case here
            e.printStackTrace();
        }

    }

}
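
The downImage method above copies the stream one byte at a time and only closes the streams when the copy succeeds. As an optional improvement (a sketch, not part of the original class), the variant below does the same download with a buffer and uses try-with-resources so both streams are closed even when an exception is thrown; it could replace downImage inside HttpclientPoolDemo and reuses the same static file field.

    // Sketch of a buffered replacement for downImage; drop it into HttpclientPoolDemo.
    public static void downImageBuffered(String imgurl, String fileName) {
        File dir = new File(file);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(imgurl).openConnection();
            connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            // try-with-resources closes both streams even if the copy fails halfway
            try (InputStream is = connection.getInputStream();
                 FileOutputStream out = new FileOutputStream(new File(dir, fileName))) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }
        } catch (IOException e) {
            // MalformedURLException and FileNotFoundException are IOExceptions too
            e.printStackTrace();
        }
    }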


Original post: https://blog.csdn.net/fu4562018/article/details/112098690
