java-PDF读取一页某一区域内容

程序员文章站 2022-04-19 21:24:42

...

首先还是我的风格，先看效果

在 PDF 的某一页中，获取指定矩形区域内的文字内容和图片

java-PDF读取一页某一区域内容

这样就能获取想要的东西了

上代码

这里需要注意：Maven 配置分为两部分，一部分是依赖（dependency），另一部分是仓库地址（repositories），两者都要加到 pom.xml 中
java-PDF读取一页某一区域内容

	<!-- pdf转换 -->
        <dependency>
            <groupId>e-iceblue</groupId>
            <artifactId>spire.pdf</artifactId>
            <version>3.4.2</version>
        </dependency>
    
    <!-- pdf转换 -->
    <repositories>
        <repository>
            <id>com.e-iceblue</id>
            <url>http://repo.e-iceblue.cn/repository/maven-public/</url>
        </repository>
    </repositories>

一个完整的工具类

import com.spire.pdf.*;
import com.spire.pdf.exporting.PdfImageInfo;
import com.spire.pdf.graphics.PdfGraphicsUnit;
import com.spire.pdf.graphics.PdfUnitConvertor;

import javax.imageio.ImageIO;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;

/**
 * Extracts the text and the embedded images found inside a rectangular region
 * of the first page of a PDF document, using Spire.PDF.
 *
 * <p>The region is measured in screen pixels and converted to points before
 * it is handed to the library.
 *
 * @author: 云
 * @date: 2020/8/24 20:01
 * @version: 1.0
 */
public class cs {

    /** PDF file to read. */
    private static final String INPUT_PDF = "C:\\Users\\zhangyunhao\\Desktop\\1231.pdf";

    /** Text file that receives the extracted text. */
    private static final String OUTPUT_TEXT = "D:\\ExtractText.txt";

    /** Prefix of the exported image files; the image index and ".png" are appended. */
    private static final String OUTPUT_IMAGE_PREFIX = "D:\\123";

    /*
     * Region to extract, measured in pixels.
     *
     * Notes carried over from the original author:
     *  - Measure with the viewer at 100% zoom, otherwise the values are off.
     *  - Coordinates start at the top-left corner of the PDF page itself,
     *    not at the top-left corner of the viewer window.
     */
    private static final float REGION_X_PX = 0;
    private static final float REGION_Y_PX = 1072;
    private static final float REGION_WIDTH_PX = 1116;
    private static final float REGION_HEIGHT_PX = 831;

    /**
     * Loads the PDF, writes the text of the configured region to
     * {@link #OUTPUT_TEXT} and saves every image whose bounds match the region
     * as a PNG file.
     *
     * @param args unused
     * @throws IOException if an image or the text file cannot be written
     */
    public static void main(String[] args) throws IOException {
        PdfDocument pdf = new PdfDocument();
        try {
            pdf.loadFromFile(INPUT_PDF);

            // Only the first page is processed.
            PdfPageBase page = pdf.getPages().get(0);

            // Pixel -> point conversion. The reverse direction would be
            // convertUnits(value, PdfGraphicsUnit.Point, PdfGraphicsUnit.Pixel).
            PdfUnitConvertor unitCvtr = new PdfUnitConvertor();
            float pointWidth = unitCvtr.convertUnits(REGION_WIDTH_PX, PdfGraphicsUnit.Pixel, PdfGraphicsUnit.Point);
            float pointHeight = unitCvtr.convertUnits(REGION_HEIGHT_PX, PdfGraphicsUnit.Pixel, PdfGraphicsUnit.Point);
            float x = unitCvtr.convertUnits(REGION_X_PX, PdfGraphicsUnit.Pixel, PdfGraphicsUnit.Point);
            float y = unitCvtr.convertUnits(REGION_Y_PX, PdfGraphicsUnit.Pixel, PdfGraphicsUnit.Point);

            // x, y, width and height together pin down the rectangle.
            Rectangle2D.Float region = new Rectangle2D.Float(x, y, pointWidth, pointHeight);

            String text = page.extractText(region);
            System.out.println(pointWidth + "*" + pointHeight);

            // Image info of the page; the reported x/y are the image's top-left corner.
            PdfImageInfo[] imageInfo = page.getImagesInfo();
            if (imageInfo != null) { // defensive: library behavior for image-less pages not verified
                for (int i = 0; i < imageInfo.length; i++) {
                    if (isImageInRegion(imageInfo[i].getBounds(), region)) {
                        BufferedImage image = imageInfo[i].getImage();
                        ImageIO.write(image, "PNG", new File(OUTPUT_IMAGE_PREFIX + i + ".png"));
                    }
                }
            }

            // append=false: every run replaces the previous output instead of
            // appending to it. try-with-resources closes the writer even on failure.
            try (BufferedWriter bw = new BufferedWriter(new FileWriter(OUTPUT_TEXT, false))) {
                if (text != null) {
                    bw.write(text);
                }
            }
        } finally {
            pdf.close();
        }
    }

    /**
     * Tells whether an image belongs to the selected region: its top-left
     * corner must lie inside the region and its width and height must not
     * exceed the region's width and height.
     *
     * @param bounds bounds of the image on the page
     * @param region selected region, in the same unit as {@code bounds}
     * @return {@code true} if the image is considered part of the region
     */
    private static boolean isImageInRegion(Rectangle2D bounds, Rectangle2D region) {
        // The right-hand limit is region x + width; the original compared
        // against the width alone, which is only correct when x is 0.
        boolean cornerInside = bounds.getX() >= region.getMinX()
                && bounds.getX() <= region.getMaxX()
                && bounds.getY() >= region.getMinY()
                && bounds.getY() <= region.getMaxY();
        boolean sizeFits = bounds.getWidth() <= region.getWidth()
                && bounds.getHeight() <= region.getHeight();
        return cornerInside && sizeFits;
    }

}

然后就可以了

这里附带一点福利：用 Java 实现屏幕截图

import java.awt.Dimension;
import java.awt.Rectangle;
import java.awt.Robot;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import java.io.File;

import javax.imageio.ImageIO;

/**
 * Screen capture helper: grabs a rectangle of the screen with
 * {@link Robot} and stores it as an image file.
 *
 * <p>File names are built as {@code prefix + serialNum + "." + format}, where
 * {@code serialNum} is a counter shared by all instances.
 */
public class CameraTest {

    private String filePreStr; // file name prefix; may include a directory, e.g. "d:/shots/img"
    private String defName = "cameraImg";  // prefix used by the no-arg constructor
    static int serialNum = 0;  // running number appended to the file name
    private String imageFormat; // image format name passed to ImageIO, also used as extension
    private String defaultImageFormat = "png"; // format used by the no-arg constructor
    Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); // screen size, read when the instance is created

    /** Creates a capturer that uses the default prefix and the default image format. */
    public CameraTest() {
        filePreStr = defName;
        imageFormat = defaultImageFormat;
    }

    /**
     * Creates a capturer with a custom file prefix and image format.
     *
     * @param s      file name prefix (may contain a directory)
     * @param format image format name, e.g. {@code "png"}
     */
    public CameraTest(String s, String format) {
        filePreStr = s;
        imageFormat = format;
    }

    /**
     * Captures the fixed region x=300, y=600, width=280, height=400 and saves it.
     * Kept for backward compatibility; see {@link #snapShot(Rectangle)} for an
     * arbitrary region.
     */
    public void snapShot() {
        snapShot(new Rectangle(300, 600, 280, 400));
    }

    /** Captures the whole screen, using the size read at construction time. */
    public void snapShotFullScreen() {
        snapShot(new Rectangle(d));
    }

    /**
     * Captures the given screen region and writes it to the next numbered file.
     * Failures are reported on standard output rather than thrown, as before.
     *
     * @param region screen area to capture, in screen coordinates
     * @throws IllegalArgumentException if {@code region} is {@code null}
     */
    public void snapShot(Rectangle region) {
        if (region == null) {
            throw new IllegalArgumentException("region must not be null");
        }
        try {
            // Core step: copy the screen region into a BufferedImage.
            BufferedImage screenshot = (new Robot()).createScreenCapture(region);
            serialNum++;
            // Build the file name from the prefix, the counter and the format.
            String name = filePreStr + String.valueOf(serialNum) + "." + imageFormat;
            File f = new File(name);
            System.out.print("Save File " + name);
            // ImageIO.write returns false when no writer supports the format;
            // in that case nothing was written, so do not report success.
            boolean written = ImageIO.write(screenshot, imageFormat, f);
            if (written) {
                System.out.print("..Finished!\n");
            } else {
                System.out.print("..Failed: no image writer for format \"" + imageFormat + "\"\n");
            }
        } catch (Exception ex) {
            System.out.println(ex);
        }
    }

    /**
     * Demo entry point: saves one capture of the default region as a PNG file
     * whose name starts with {@code d:\Hello}. A front end could pass a
     * user-selected rectangle to {@link #snapShot(Rectangle)} instead.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        CameraTest cam = new CameraTest("d:\\Hello", "png");
        cam.snapShot();
    }
}

java-PDF读取一页某一区域内容

首先还是我的风格，先看效果

在某一页pdf中获取某一块区域的内容和图片

上代码

一个完整的工具类

这里带一点福利，java截图

不用截图如何将Excel表格中某一区域的数据内容转成图片以便使用

PHP如何读取INI文件的某一节点的所有内容？

java-PDF读取一页某一区域内容

不用截图如何将Excel表格中某一区域的数据内容转成图片以便使用