欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

PDF to HTML 2020(1)pdftohtml Linux tool or PDFBox

程序员文章站 2022-03-30 19:24:22
...
PDF to HTML 2020(1)pdftohtml Linux tool or PDFBox

On Ubuntu
> sudo apt-get install pdftohtml

Or

> sudo apt-get install poppler-utils

Try the command
> pdftohtml homedepot.pdf homedepot.html -c -noframes

> pdftohtml -enc UTF-8 -noframes homedepot.pdf homedepot.html

> pdftohtml -enc UTF-8 -noframes -c homedepot.pdf homedepot.html

It looks ok, but I will try other packages.

Try this library
https://pdfbox.apache.org/2.0/getting-started.html
Command line
https://pdfbox.apache.org/2.0/commandline.html

Render each PDF page directly to an image, then generate an HTML page whose <img src> tags point to those images.
package com.cloudsnap.connector.netsuite;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;

import javax.imageio.ImageIO;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;

public class PDFImageApp {

    public static void main(String[] args) {
        PdfToImage("/Users/hluo/data/homedepot.pdf");
    }

    public static void PdfToImage(String pdfurl) {
        StringBuffer buffer = new StringBuffer();
        FileOutputStream fos;
        PDDocument document;
        File pdfFile;
        int size;
        BufferedImage image;
        FileOutputStream out;
        Long randStr = 0l;
        // PDF转换成HTML保存的文件夹
        String path = "/Users/hluo/data/";
        File htmlsDir = new File(path);
        if (!htmlsDir.exists()) {
            htmlsDir.mkdirs();
        }
        File htmlDir = new File(path + "/");
        if (!htmlDir.exists()) {
            htmlDir.mkdirs();
        }
        try {
            // 遍历处理pdf附件
            randStr = System.currentTimeMillis();
            buffer.append("<!doctype html>\r\n");
            buffer.append("<head>\r\n");
            buffer.append("<meta charset=\"UTF-8\">\r\n");
            buffer.append("</head>\r\n");
            buffer.append("<body style=\"background-color:gray;\">\r\n");
            buffer.append("<style>\r\n");
            buffer.append(
                    "img {background-color:#fff; text-align:center; width:100%; max-width:100%;margin-top:6px;}\r\n");
            buffer.append("</style>\r\n");
            document = new PDDocument();
            // pdf附件
            pdfFile = new File(pdfurl);
            document = PDDocument.load(pdfFile, (String) null);
            size = document.getNumberOfPages();
            Long start = System.currentTimeMillis(), end = null;
            System.out.println("===>pdf : " + pdfFile.getName() + " , size : " + size);
            PDFRenderer reader = new PDFRenderer(document);
            for (int i = 0; i < size; i++) {
                // image = new PDFRenderer(document).renderImageWithDPI(i,130,ImageType.RGB);
                image = reader.renderImage(i, 1.5f);
                // 生成图片,保存位置
                out = new FileOutputStream(path + "/" + "image" + "_" + i + ".jpg");
                ImageIO.write(image, "png", out); // 使用png的清晰度
                // 将图片路径追加到网页文件里
                buffer.append("<img src=\"" + path + "/" + "image" + "_" + i + ".jpg\"/>\r\n");
                image = null;
                out.flush();
                out.close();
            }
            reader = null;
            document.close();
            buffer.append("</body>\r\n");
            buffer.append("</html>");
            end = System.currentTimeMillis() - start;
            System.out.println("===> Reading pdf times: " + (end / 1000));
            start = end = null;
            // 生成网页文件
            fos = new FileOutputStream(path + randStr + ".html");
            System.out.println(path + randStr + ".html");
            fos.write(buffer.toString().getBytes());
            fos.flush();
            fos.close();
            buffer.setLength(0);

        } catch (Exception e) {
            System.out.println("===>Reader parse pdf to jpg error : " + e.getMessage());
            e.printStackTrace();
        }
    }

}

Get the Images and Text Separately
package com.cloudsnap.connector.netsuite;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import javax.imageio.ImageIO;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.text.PDFTextStripper;

public class PDFApp {

    /**
     * 获取格式化后的时间信息
     *
     * @param calendar 时间信息
     * @return
     */
    public static String dateFormat(Calendar calendar) {
        if (null == calendar)
            return null;
        String date = null;
        String pattern = "yyyy-MM-dd HH:mm:ss";
        SimpleDateFormat format = new SimpleDateFormat(pattern);
        date = format.format(calendar.getTime());
        return date == null ? "" : date;
    }

    /** 打印纲要 **/
    public static void getPDFOutline(String file) {
        try {
            // 打开pdf文件流
            FileInputStream fis = new FileInputStream(file);
            // 加载 pdf 文档,获取PDDocument文档对象
            PDDocument document = PDDocument.load(fis);
            // 获取PDDocumentCatalog文档目录对象
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            // 获取PDDocumentOutline文档纲要对象
            PDDocumentOutline outline = catalog.getDocumentOutline();
            // 获取第一个纲要条目(标题1)
            PDOutlineItem item = outline.getFirstChild();
            if (outline != null) {
                // 遍历每一个标题1
                while (item != null) {
                    // 打印标题1的文本
                    System.out.println("Item:" + item.getTitle());
                    // 获取标题1下的第一个子标题(标题2)
                    PDOutlineItem child = item.getFirstChild();
                    // 遍历每一个标题2
                    while (child != null) {
                        // 打印标题2的文本
                        System.out.println("    Child:" + child.getTitle());
                        // 指向下一个标题2
                        child = child.getNextSibling();
                    }
                    // 指向下一个标题1
                    item = item.getNextSibling();
                }
            }
            // 关闭输入流
            document.close();
            fis.close();
        } catch (FileNotFoundException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /** 打印一级目录 **/
    public static void getPDFCatalog(String file) {
        try {
            // 打开pdf文件流
            FileInputStream fis = new FileInputStream(file);
            // 加载 pdf 文档,获取PDDocument文档对象
            PDDocument document = PDDocument.load(fis);
            // 获取PDDocumentCatalog文档目录对象
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            // 获取PDDocumentOutline文档纲要对象
            PDDocumentOutline outline = catalog.getDocumentOutline();
            // 获取第一个纲要条目(标题1)
            if (outline != null) {
                PDOutlineItem item = outline.getFirstChild();
                // 遍历每一个标题1
                while (item != null) {
                    // 打印标题1的文本
                    System.out.println("Item:" + item.getTitle());
                    // 指向下一个标题1
                    item = item.getNextSibling();
                }
            }
            // 关闭输入流
            document.close();
            fis.close();
        } catch (FileNotFoundException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /** 获取PDF文档元数据 **/
    public static void getPDFInformation(String file) {
        try {
            // 打开pdf文件流
            FileInputStream fis = new FileInputStream(file);
            // 加载 pdf 文档,获取PDDocument文档对象
            PDDocument document = PDDocument.load(fis);
            /** 文档属性信息 **/
            PDDocumentInformation info = document.getDocumentInformation();

            System.out.println("页数:" + document.getNumberOfPages());

            System.out.println("标题:" + info.getTitle());
            System.out.println("主题:" + info.getSubject());
            System.out.println("作者:" + info.getAuthor());
            System.out.println("关键字:" + info.getKeywords());

            System.out.println("应用程序:" + info.getCreator());
            System.out.println("pdf 制作程序:" + info.getProducer());

            System.out.println("Trapped:" + info.getTrapped());

            System.out.println("创建时间:" + dateFormat(info.getCreationDate()));
            System.out.println("修改时间:" + dateFormat(info.getModificationDate()));

            // 关闭输入流
            document.close();
            fis.close();
        } catch (FileNotFoundException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * 提取部分PDF文本转HTML
     *
     * @param file      pdf文档路径
     * @param startPage 开始页数
     * @param endPage   结束页数
     * @param htmlFile  保存文件路径
     */
    public void getText(String pdfFile, int startPage, int endPage, String htmlFile) throws Exception {

        // 是否排序
        boolean sort = true;
        // hpd文件路径
        // String pdfFile = "D:\\PDF\\" + file;
        // 编码方式
        String encoding = "UTF-8";
        // 开始提取页数
        // int startPage = 1;
        // 结束提取页数,最大
        // int endPage = Integer.MAX_VALUE;

        // 文件输入流,生成文本文件
        Writer output = null;
        // 内存中存储的PDF Document
        PDDocument document = null;

        String result = null;
        try {
            PDDocument doc = PDDocument.load(new File(pdfFile));
            result = new PDFTextStripper().getText(doc);
            // 文件输入流,写入HTML文件
            output = new OutputStreamWriter(new FileOutputStream(htmlFile), encoding);

            output.write(
                    "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"> \r\n");
            output.write("<html> \r\n");
            output.write("<head> \r\n");
            output.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"> \r\n");
            output.write("</head> \r\n");
            output.write("<body> \r\n");
            output.write("<center>");
            output.write(result);
            // 调用PDFTextStripper的writeText提取并输出文本
            // stripper.writeText(document, output);
            output.write("</center>");
            output.write("</body> \r\n");
            output.write("</html> \r\n");
        } finally {
            if (output != null) {
                // 关闭输出流
                output.close();
            }
            if (document != null) {
                // 关闭PDF Document
                document.close();
            }
        }
    }

    /**
     * 提取PDF图片并保存
     *
     * @param file        PDF文档路径
     * @param imgSavePath 图片保存路径
     * @throws IOException
     */
    public void getImage(String pdfFile, String imgSavePath) throws IOException {
        PDDocument document = PDDocument.load(new File(pdfFile));
        PDPageTree list = document.getPages();
        for (PDPage page : list) {
            PDResources pdResources = page.getResources();
            for (COSName c : pdResources.getXObjectNames()) {
                PDXObject o = pdResources.getXObject(c);
                if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                    File file = new File(imgSavePath + System.nanoTime() + ".png");
                    ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png",
                            file);
                }
            }
        }
    }

    public static void main(String[] args) {
        PDFApp pth = new PDFApp();
        String pdfFile = "/Users/hluo/data/homedepot.pdf";
        long startTime = System.currentTimeMillis();
        try {
            pth.getText(pdfFile, 1, Integer.MAX_VALUE, "/Users/hluo/data/homedepot.html");
            pth.getImage(pdfFile, "/Users/hluo/data/");
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("读写所用时间为:" + (endTime - startTime) + "ms    " + pdfFile + "转换完成");
    }

}




References:
https://stackoverflow.com/questions/8370014/how-to-convert-pdf-to-html
https://blog.csdn.net/qq_34719291/article/details/75589024
https://blog.csdn.net/Warren_one/article/details/78625546
https://stackoverflow.com/questions/23813727/how-to-extract-text-from-a-pdf-file-with-apache-pdfbox