PDF to HTML 2020(1)pdftohtml Linux tool or PDFBox

程序员文章站 2022-03-30 19:24:22

...

PDF to HTML 2020(1)pdftohtml Linux tool or PDFBox

On Ubuntu
> sudo apt-get install pdftohtml

Or

> sudo apt-get install poppler-utils

Try the command
> pdftohtml homedepot.pdf homedepot.html -c -noframes

> pdftohtml -enc UTF-8 -noframes homedepot.pdf homedepot.html

> pdftohtml -enc UTF-8 -noframes -c homedepot.pdf homedepot.html

It looks ok, but I will try other packages.

Try this library
https://pdfbox.apache.org/2.0/getting-started.html
Command line
https://pdfbox.apache.org/2.0/commandline.html

Render each PDF page directly to an image, then reference those images from an HTML page with <img src="..."> tags.
package com.cloudsnap.connector.netsuite;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;

import javax.imageio.ImageIO;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;

public class PDFImageApp {

    /** Directory that receives the rendered page images and the generated HTML page. */
    private static final String OUTPUT_DIR = "/Users/hluo/data/";

    public static void main(String[] args) {
        PdfToImage("/Users/hluo/data/homedepot.pdf");
    }

    /**
     * Renders every page of a PDF to a PNG image and writes an HTML page that shows the
     * images in page order.
     *
     * <p>The images are saved as {@code image_<pageIndex>.png} in {@link #OUTPUT_DIR}; the HTML
     * page is saved next to them as {@code <currentTimeMillis>.html} and references the images
     * by relative file name. Failures are not propagated: any exception is reported on standard
     * output and the stack trace is printed.
     *
     * @param pdfurl file system path of the PDF to convert
     */
    public static void PdfToImage(String pdfurl) {
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }

        // The timestamp doubles as the name of the generated HTML file.
        long htmlId = System.currentTimeMillis();

        StringBuilder html = new StringBuilder();
        html.append("<!doctype html>\r\n");
        html.append("<html>\r\n");
        html.append("<head>\r\n");
        html.append("<meta charset=\"UTF-8\">\r\n");
        html.append("<style>\r\n");
        html.append(
                "img {background-color:#fff; text-align:center; width:100%; max-width:100%;margin-top:6px;}\r\n");
        html.append("</style>\r\n");
        html.append("</head>\r\n");
        html.append("<body style=\"background-color:gray;\">\r\n");

        try {
            File pdfFile = new File(pdfurl);
            long elapsedMillis;
            // try-with-resources closes the document even when rendering a page fails.
            try (PDDocument document = PDDocument.load(pdfFile, (String) null)) {
                int pageCount = document.getNumberOfPages();
                long start = System.currentTimeMillis();
                System.out.println("===>pdf : " + pdfFile.getName() + " , size : " + pageCount);

                PDFRenderer renderer = new PDFRenderer(document);
                for (int i = 0; i < pageCount; i++) {
                    BufferedImage image = renderer.renderImage(i, 1.5f);
                    // The data is PNG-encoded (kept for sharpness), so the file is named .png.
                    String imageName = "image_" + i + ".png";
                    if (!ImageIO.write(image, "png", new File(outputDir, imageName))) {
                        throw new IllegalStateException("No PNG image writer available for " + imageName);
                    }
                    // The HTML file lives in the same directory, so a relative reference is enough.
                    html.append("<img src=\"" + imageName + "\"/>\r\n");
                }
                elapsedMillis = System.currentTimeMillis() - start;
            }
            html.append("</body>\r\n");
            html.append("</html>");
            System.out.println("===> Reading pdf times: " + (elapsedMillis / 1000));

            File htmlFile = new File(outputDir, htmlId + ".html");
            System.out.println(htmlFile.getPath());
            try (FileOutputStream fos = new FileOutputStream(htmlFile)) {
                // Encode explicitly as UTF-8 to match the charset declared in the <meta> tag.
                fos.write(html.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }
        } catch (Exception e) {
            System.out.println("===>Reader parse pdf to jpg error : " + e.getMessage());
            e.printStackTrace();
        }
    }

}

Get the Images and Text Separately
package com.cloudsnap.connector.netsuite;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import javax.imageio.ImageIO;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.text.PDFTextStripper;

public class PDFApp {

    /**
     * Formats a calendar value as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param calendar the time to format, may be {@code null}
     * @return the formatted time, or {@code null} when {@code calendar} is {@code null}
     */
    public static String dateFormat(Calendar calendar) {
        if (calendar == null) {
            return null;
        }
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return format.format(calendar.getTime());
    }

    /**
     * Prints the document outline (bookmarks) two levels deep: every top-level item and,
     * indented below it, each of its direct children. Prints nothing when the document has
     * no outline. I/O failures are reported by printing the stack trace.
     *
     * @param file path of the PDF document
     */
    public static void getPDFOutline(String file) {
        // try-with-resources closes the document even when reading the outline fails.
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentOutline outline = catalog.getDocumentOutline();
            // The outline is optional; check for null before touching it.
            if (outline != null) {
                // Walk every level-1 heading.
                PDOutlineItem item = outline.getFirstChild();
                while (item != null) {
                    System.out.println("Item:" + item.getTitle());
                    // Walk every level-2 heading below the current level-1 heading.
                    PDOutlineItem child = item.getFirstChild();
                    while (child != null) {
                        System.out.println("    Child:" + child.getTitle());
                        child = child.getNextSibling();
                    }
                    item = item.getNextSibling();
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints the titles of the top-level outline (bookmark) items only. Prints nothing when
     * the document has no outline. I/O failures are reported by printing the stack trace.
     *
     * @param file path of the PDF document
     */
    public static void getPDFCatalog(String file) {
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentOutline outline = catalog.getDocumentOutline();
            if (outline != null) {
                // Walk every level-1 heading.
                PDOutlineItem item = outline.getFirstChild();
                while (item != null) {
                    System.out.println("Item:" + item.getTitle());
                    item = item.getNextSibling();
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints the page count and the document information entries (title, subject, author,
     * keywords, creator, producer, trapped flag, creation and modification time). I/O
     * failures are reported by printing the stack trace.
     *
     * @param file path of the PDF document
     */
    public static void getPDFInformation(String file) {
        try (PDDocument document = PDDocument.load(new File(file))) {
            // Document property information.
            PDDocumentInformation info = document.getDocumentInformation();

            System.out.println("页数:" + document.getNumberOfPages());

            System.out.println("标题:" + info.getTitle());
            System.out.println("主题:" + info.getSubject());
            System.out.println("作者:" + info.getAuthor());
            System.out.println("关键字:" + info.getKeywords());

            System.out.println("应用程序:" + info.getCreator());
            System.out.println("pdf 制作程序:" + info.getProducer());

            System.out.println("Trapped:" + info.getTrapped());

            System.out.println("创建时间:" + dateFormat(info.getCreationDate()));
            System.out.println("修改时间:" + dateFormat(info.getModificationDate()));
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Extracts the text of a page range from a PDF and writes it into a minimal UTF-8 HTML
     * file. The text is HTML-escaped so that {@code &}, {@code <} and {@code >} occurring in
     * the PDF cannot break the generated markup.
     *
     * @param pdfFile   path of the PDF document
     * @param startPage first page to extract (1-based)
     * @param endPage   last page to extract (inclusive); {@code Integer.MAX_VALUE} extracts
     *                  through the end of the document
     * @param htmlFile  path of the HTML file to write
     * @throws Exception if the PDF cannot be read or the HTML file cannot be written
     */
    public void getText(String pdfFile, int startPage, int endPage, String htmlFile) throws Exception {
        String encoding = "UTF-8";

        // Extract first, so a PDF that fails to load never creates or truncates the HTML file.
        String result;
        try (PDDocument doc = PDDocument.load(new File(pdfFile))) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            result = stripper.getText(doc);
        }

        try (Writer output = new OutputStreamWriter(new FileOutputStream(htmlFile), encoding)) {
            output.write(
                    "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"> \r\n");
            output.write("<html> \r\n");
            output.write("<head> \r\n");
            output.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"> \r\n");
            output.write("</head> \r\n");
            output.write("<body> \r\n");
            output.write("<center>");
            output.write(escapeHtml(result));
            output.write("</center>");
            output.write("</body> \r\n");
            output.write("</html> \r\n");
        }
    }

    /** Replaces the HTML-significant characters {@code &}, {@code <} and {@code >} by entities. */
    private static String escapeHtml(String text) {
        StringBuilder escaped = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            switch (ch) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                default:
                    escaped.append(ch);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Saves every image XObject found directly in the resources of each page as a PNG file.
     * Each file is named {@code imgSavePath + System.nanoTime() + ".png"}, so
     * {@code imgSavePath} acts as a path prefix.
     *
     * @param pdfFile     path of the PDF document
     * @param imgSavePath prefix (typically a directory ending in a separator) for the PNG files
     * @throws IOException if the PDF cannot be read or an image cannot be written
     */
    public void getImage(String pdfFile, String imgSavePath) throws IOException {
        // try-with-resources closes the document even when writing an image fails.
        try (PDDocument document = PDDocument.load(new File(pdfFile))) {
            PDPageTree pages = document.getPages();
            for (PDPage page : pages) {
                PDResources pdResources = page.getResources();
                if (pdResources == null) {
                    // A page without a resource dictionary has no images to extract.
                    continue;
                }
                for (COSName c : pdResources.getXObjectNames()) {
                    PDXObject o = pdResources.getXObject(c);
                    if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                        File file = new File(imgSavePath + System.nanoTime() + ".png");
                        ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png",
                                file);
                    }
                }
            }
        }
    }

    public static void main(String[] args) {
        PDFApp pth = new PDFApp();
        String pdfFile = "/Users/hluo/data/homedepot.pdf";
        long startTime = System.currentTimeMillis();
        try {
            pth.getText(pdfFile, 1, Integer.MAX_VALUE, "/Users/hluo/data/homedepot.html");
            pth.getImage(pdfFile, "/Users/hluo/data/");
        } catch (Exception e) {
            e.printStackTrace();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("读写所用时间为：" + (endTime - startTime) + "ms    " + pdfFile + "转换完成");
    }

}

References:
https://stackoverflow.com/questions/8370014/how-to-convert-pdf-to-html
https://blog.csdn.net/qq_34719291/article/details/75589024
https://blog.csdn.net/Warren_one/article/details/78625546
https://stackoverflow.com/questions/23813727/how-to-extract-text-from-a-pdf-file-with-apache-pdfbox

上一篇： iOS 14.3/iPadOS 14.3正式版值得升级吗?iOS 14.3/iPadOS 14.3正式版更新

下一篇： Spark(4)Deal with Mesos