PDF to HTML 2020(1)pdftohtml Linux tool or PDFBox
程序员文章站
2022-03-30 19:24:22
...
PDF to HTML 2020(1)pdftohtml Linux tool or PDFBox
On Ubuntu
> sudo apt-get install pdftohtml
Or
> sudo apt-get install poppler-utils
Try the command
> pdftohtml homedepot.pdf homedepot.html -c -noframes
> pdftohtml -enc UTF-8 -noframes homedepot.pdf homedepot.html
> pdftohtml -enc UTF-8 -noframes -c homedepot.pdf homedepot.html
It looks ok, but I will try other packages.
Try this library
https://pdfbox.apache.org/2.0/getting-started.html
Command line
https://pdfbox.apache.org/2.0/commandline.html
Render each page directly to an image and reference the images from the HTML with <img src>.
package com.cloudsnap.connector.netsuite;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
/**
 * Renders every page of a PDF to a PNG image and writes an HTML page that
 * displays those images in page order.
 */
public class PDFImageApp {

    /** Directory that receives the page images and the generated HTML file. */
    private static final String OUTPUT_DIR = "/Users/hluo/data/";

    /** Scale factor handed to the renderer for every page. */
    private static final float RENDER_SCALE = 1.5f;

    public static void main(String[] args) {
        PdfToImage("/Users/hluo/data/homedepot.pdf");
    }

    /**
     * Converts the given PDF into one PNG per page plus an HTML file that
     * references the images. The HTML file is named after the current time in
     * milliseconds and is written next to the images.
     *
     * <p>Failures are reported on standard output and are not rethrown.
     *
     * @param pdfurl file-system path of the PDF to convert
     */
    public static void PdfToImage(String pdfurl) {
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }

        long stamp = System.currentTimeMillis();
        File pdfFile = new File(pdfurl);

        StringBuilder html = new StringBuilder();
        html.append("<!doctype html>\r\n");
        html.append("<html>\r\n");
        html.append("<head>\r\n");
        html.append("<meta charset=\"UTF-8\">\r\n");
        html.append("<style>\r\n");
        html.append(
                "img {background-color:#fff; text-align:center; width:100%; max-width:100%;margin-top:6px;}\r\n");
        html.append("</style>\r\n");
        html.append("</head>\r\n");
        html.append("<body style=\"background-color:gray;\">\r\n");

        try {
            long start = System.currentTimeMillis();

            // try-with-resources closes the document even when rendering fails.
            try (PDDocument document = PDDocument.load(pdfFile)) {
                int size = document.getNumberOfPages();
                System.out.println("===>pdf : " + pdfFile.getName() + " , size : " + size);

                PDFRenderer renderer = new PDFRenderer(document);
                for (int i = 0; i < size; i++) {
                    BufferedImage image = renderer.renderImage(i, RENDER_SCALE);

                    // The data is PNG-encoded, so the file carries a .png extension.
                    File imageFile = new File(outputDir, "image_" + i + ".png");
                    try (FileOutputStream out = new FileOutputStream(imageFile)) {
                        ImageIO.write(image, "png", out);
                    }

                    html.append("<img src=\"").append(imageFile.getPath()).append("\"/>\r\n");
                }
            }

            html.append("</body>\r\n");
            html.append("</html>");

            long elapsed = System.currentTimeMillis() - start;
            System.out.println("===> Reading pdf times: " + (elapsed / 1000));

            File htmlFile = new File(outputDir, stamp + ".html");
            System.out.println(htmlFile.getPath());
            try (FileOutputStream fos = new FileOutputStream(htmlFile)) {
                // Encode explicitly as UTF-8 to match the charset declared in the page.
                fos.write(html.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }
        } catch (Exception e) {
            System.out.println("===>Reader parse pdf to jpg error : " + e.getMessage());
            e.printStackTrace();
        }
    }
}
Get the Images and Text Separately
package com.cloudsnap.connector.netsuite;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import javax.imageio.ImageIO;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Sample utilities built on Apache PDFBox: print a PDF's outline and metadata,
 * extract its text into an HTML file, and save its embedded images.
 */
public class PDFApp {

    /**
     * Formats a calendar value as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param calendar the time to format; may be {@code null}
     * @return the formatted time, or {@code null} when {@code calendar} is {@code null}
     */
    public static String dateFormat(Calendar calendar) {
        if (calendar == null) {
            return null;
        }
        return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(calendar.getTime());
    }

    /**
     * Prints the first two levels of the document outline (bookmarks).
     * Nothing is printed when the document has no outline. I/O failures are
     * printed as a stack trace and not rethrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFOutline(String file) {
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentOutline outline = catalog.getDocumentOutline();
            // Check for a missing outline before touching it.
            if (outline == null) {
                return;
            }
            // Walk every first-level entry.
            PDOutlineItem item = outline.getFirstChild();
            while (item != null) {
                System.out.println("Item:" + item.getTitle());
                // Walk the second-level entries below the current one.
                PDOutlineItem child = item.getFirstChild();
                while (child != null) {
                    System.out.println(" Child:" + child.getTitle());
                    child = child.getNextSibling();
                }
                item = item.getNextSibling();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints only the first-level entries of the document outline.
     * Nothing is printed when the document has no outline. I/O failures are
     * printed as a stack trace and not rethrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFCatalog(String file) {
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentOutline outline = catalog.getDocumentOutline();
            if (outline == null) {
                return;
            }
            PDOutlineItem item = outline.getFirstChild();
            while (item != null) {
                System.out.println("Item:" + item.getTitle());
                item = item.getNextSibling();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints the page count and the document information fields (title,
     * subject, author, keywords, creator, producer, trapped flag, creation and
     * modification time). I/O failures are printed as a stack trace and not
     * rethrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFInformation(String file) {
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDDocumentInformation info = document.getDocumentInformation();
            System.out.println("页数:" + document.getNumberOfPages());
            System.out.println("标题:" + info.getTitle());
            System.out.println("主题:" + info.getSubject());
            System.out.println("作者:" + info.getAuthor());
            System.out.println("关键字:" + info.getKeywords());
            System.out.println("应用程序:" + info.getCreator());
            System.out.println("pdf 制作程序:" + info.getProducer());
            System.out.println("Trapped:" + info.getTrapped());
            System.out.println("创建时间:" + dateFormat(info.getCreationDate()));
            System.out.println("修改时间:" + dateFormat(info.getModificationDate()));
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Extracts the text of a page range and writes it into a UTF-8 HTML file.
     * The extracted text is HTML-escaped before it is written, so characters
     * such as {@code <} and {@code &} cannot break the generated markup.
     *
     * @param pdfFile   path of the PDF document
     * @param startPage first page to extract (1-based, inclusive)
     * @param endPage   last page to extract (1-based, inclusive); use
     *                  {@link Integer#MAX_VALUE} for "up to the last page"
     * @param htmlFile  path of the HTML file to write
     * @throws Exception if the PDF cannot be read or the HTML file cannot be written
     */
    public void getText(String pdfFile, int startPage, int endPage, String htmlFile) throws Exception {
        String encoding = "UTF-8";

        // Extract first so that no HTML file is created when the PDF cannot be read.
        String result;
        try (PDDocument document = PDDocument.load(new File(pdfFile))) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            result = stripper.getText(document);
        }

        try (FileOutputStream fileOut = new FileOutputStream(htmlFile);
                Writer output = new OutputStreamWriter(fileOut, encoding)) {
            output.write(
                    "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"> \r\n");
            output.write("<html> \r\n");
            output.write("<head> \r\n");
            output.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"> \r\n");
            output.write("</head> \r\n");
            output.write("<body> \r\n");
            output.write("<center>");
            output.write(escapeHtml(result));
            output.write("</center>");
            output.write("</body> \r\n");
            output.write("</html> \r\n");
        }
    }

    /** Replaces the characters that are significant in HTML markup with entities. */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            switch (ch) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(ch);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Saves every image XObject found in the pages' resources as a PNG file.
     * Each file is named {@code imgSavePath + System.nanoTime() + ".png"}, so
     * {@code imgSavePath} acts as a path prefix.
     *
     * @param pdfFile     path of the PDF document
     * @param imgSavePath path prefix for the saved images
     * @throws IOException if the PDF cannot be read or an image cannot be written
     */
    public void getImage(String pdfFile, String imgSavePath) throws IOException {
        try (PDDocument document = PDDocument.load(new File(pdfFile))) {
            PDPageTree pages = document.getPages();
            for (PDPage page : pages) {
                PDResources resources = page.getResources();
                // A page without a resource dictionary has no images to export.
                if (resources == null) {
                    continue;
                }
                for (COSName name : resources.getXObjectNames()) {
                    PDXObject xObject = resources.getXObject(name);
                    if (xObject instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                        File target = new File(imgSavePath + System.nanoTime() + ".png");
                        ImageIO.write(
                                ((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) xObject).getImage(),
                                "png", target);
                    }
                }
            }
        }
    }

    public static void main(String[] args) {
        PDFApp pth = new PDFApp();
        String pdfFile = "/Users/hluo/data/homedepot.pdf";
        long startTime = System.currentTimeMillis();
        try {
            pth.getText(pdfFile, 1, Integer.MAX_VALUE, "/Users/hluo/data/homedepot.html");
            pth.getImage(pdfFile, "/Users/hluo/data/");
        } catch (Exception e) {
            e.printStackTrace();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("读写所用时间为:" + (endTime - startTime) + "ms " + pdfFile + "转换完成");
    }
}
References:
https://stackoverflow.com/questions/8370014/how-to-convert-pdf-to-html
https://blog.csdn.net/qq_34719291/article/details/75589024
https://blog.csdn.net/Warren_one/article/details/78625546
https://stackoverflow.com/questions/23813727/how-to-extract-text-from-a-pdf-file-with-apache-pdfbox
On Ubuntu
> sudo apt-get install pdftohtml
Or
> sudo apt-get install poppler-utils
Try the command
> pdftohtml homedepot.pdf homedepot.html -c -noframes
> pdftohtml -enc UTF-8 -noframes homedepot.pdf homedepot.html
> pdftohtml -enc UTF-8 -noframes -c homedepot.pdf homedepot.html
It looks ok, but I will try other packages.
Try this library
https://pdfbox.apache.org/2.0/getting-started.html
Command line
https://pdfbox.apache.org/2.0/commandline.html
Render each page directly to an image and reference the images from the HTML with <img src>.
package com.cloudsnap.connector.netsuite;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
/**
 * Renders every page of a PDF to a PNG image and writes an HTML page that
 * displays those images in page order.
 */
public class PDFImageApp {

    /** Directory that receives the page images and the generated HTML file. */
    private static final String OUTPUT_DIR = "/Users/hluo/data/";

    /** Scale factor handed to the renderer for every page. */
    private static final float RENDER_SCALE = 1.5f;

    public static void main(String[] args) {
        PdfToImage("/Users/hluo/data/homedepot.pdf");
    }

    /**
     * Converts the given PDF into one PNG per page plus an HTML file that
     * references the images. The HTML file is named after the current time in
     * milliseconds and is written next to the images.
     *
     * <p>Failures are reported on standard output and are not rethrown.
     *
     * @param pdfurl file-system path of the PDF to convert
     */
    public static void PdfToImage(String pdfurl) {
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }

        long stamp = System.currentTimeMillis();
        File pdfFile = new File(pdfurl);

        StringBuilder html = new StringBuilder();
        html.append("<!doctype html>\r\n");
        html.append("<html>\r\n");
        html.append("<head>\r\n");
        html.append("<meta charset=\"UTF-8\">\r\n");
        html.append("<style>\r\n");
        html.append(
                "img {background-color:#fff; text-align:center; width:100%; max-width:100%;margin-top:6px;}\r\n");
        html.append("</style>\r\n");
        html.append("</head>\r\n");
        html.append("<body style=\"background-color:gray;\">\r\n");

        try {
            long start = System.currentTimeMillis();

            // try-with-resources closes the document even when rendering fails.
            try (PDDocument document = PDDocument.load(pdfFile)) {
                int size = document.getNumberOfPages();
                System.out.println("===>pdf : " + pdfFile.getName() + " , size : " + size);

                PDFRenderer renderer = new PDFRenderer(document);
                for (int i = 0; i < size; i++) {
                    BufferedImage image = renderer.renderImage(i, RENDER_SCALE);

                    // The data is PNG-encoded, so the file carries a .png extension.
                    File imageFile = new File(outputDir, "image_" + i + ".png");
                    try (FileOutputStream out = new FileOutputStream(imageFile)) {
                        ImageIO.write(image, "png", out);
                    }

                    html.append("<img src=\"").append(imageFile.getPath()).append("\"/>\r\n");
                }
            }

            html.append("</body>\r\n");
            html.append("</html>");

            long elapsed = System.currentTimeMillis() - start;
            System.out.println("===> Reading pdf times: " + (elapsed / 1000));

            File htmlFile = new File(outputDir, stamp + ".html");
            System.out.println(htmlFile.getPath());
            try (FileOutputStream fos = new FileOutputStream(htmlFile)) {
                // Encode explicitly as UTF-8 to match the charset declared in the page.
                fos.write(html.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }
        } catch (Exception e) {
            System.out.println("===>Reader parse pdf to jpg error : " + e.getMessage());
            e.printStackTrace();
        }
    }
}
Get the Images and Text Separately
package com.cloudsnap.connector.netsuite;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import javax.imageio.ImageIO;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Sample utilities built on Apache PDFBox: print a PDF's outline and metadata,
 * extract its text into an HTML file, and save its embedded images.
 */
public class PDFApp {

    /**
     * Formats a calendar value as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param calendar the time to format; may be {@code null}
     * @return the formatted time, or {@code null} when {@code calendar} is {@code null}
     */
    public static String dateFormat(Calendar calendar) {
        if (calendar == null) {
            return null;
        }
        return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(calendar.getTime());
    }

    /**
     * Prints the first two levels of the document outline (bookmarks).
     * Nothing is printed when the document has no outline. I/O failures are
     * printed as a stack trace and not rethrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFOutline(String file) {
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentOutline outline = catalog.getDocumentOutline();
            // Check for a missing outline before touching it.
            if (outline == null) {
                return;
            }
            // Walk every first-level entry.
            PDOutlineItem item = outline.getFirstChild();
            while (item != null) {
                System.out.println("Item:" + item.getTitle());
                // Walk the second-level entries below the current one.
                PDOutlineItem child = item.getFirstChild();
                while (child != null) {
                    System.out.println(" Child:" + child.getTitle());
                    child = child.getNextSibling();
                }
                item = item.getNextSibling();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints only the first-level entries of the document outline.
     * Nothing is printed when the document has no outline. I/O failures are
     * printed as a stack trace and not rethrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFCatalog(String file) {
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentOutline outline = catalog.getDocumentOutline();
            if (outline == null) {
                return;
            }
            PDOutlineItem item = outline.getFirstChild();
            while (item != null) {
                System.out.println("Item:" + item.getTitle());
                item = item.getNextSibling();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints the page count and the document information fields (title,
     * subject, author, keywords, creator, producer, trapped flag, creation and
     * modification time). I/O failures are printed as a stack trace and not
     * rethrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFInformation(String file) {
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDDocumentInformation info = document.getDocumentInformation();
            System.out.println("页数:" + document.getNumberOfPages());
            System.out.println("标题:" + info.getTitle());
            System.out.println("主题:" + info.getSubject());
            System.out.println("作者:" + info.getAuthor());
            System.out.println("关键字:" + info.getKeywords());
            System.out.println("应用程序:" + info.getCreator());
            System.out.println("pdf 制作程序:" + info.getProducer());
            System.out.println("Trapped:" + info.getTrapped());
            System.out.println("创建时间:" + dateFormat(info.getCreationDate()));
            System.out.println("修改时间:" + dateFormat(info.getModificationDate()));
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Extracts the text of a page range and writes it into a UTF-8 HTML file.
     * The extracted text is HTML-escaped before it is written, so characters
     * such as {@code <} and {@code &} cannot break the generated markup.
     *
     * @param pdfFile   path of the PDF document
     * @param startPage first page to extract (1-based, inclusive)
     * @param endPage   last page to extract (1-based, inclusive); use
     *                  {@link Integer#MAX_VALUE} for "up to the last page"
     * @param htmlFile  path of the HTML file to write
     * @throws Exception if the PDF cannot be read or the HTML file cannot be written
     */
    public void getText(String pdfFile, int startPage, int endPage, String htmlFile) throws Exception {
        String encoding = "UTF-8";

        // Extract first so that no HTML file is created when the PDF cannot be read.
        String result;
        try (PDDocument document = PDDocument.load(new File(pdfFile))) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            result = stripper.getText(document);
        }

        try (FileOutputStream fileOut = new FileOutputStream(htmlFile);
                Writer output = new OutputStreamWriter(fileOut, encoding)) {
            output.write(
                    "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"> \r\n");
            output.write("<html> \r\n");
            output.write("<head> \r\n");
            output.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"> \r\n");
            output.write("</head> \r\n");
            output.write("<body> \r\n");
            output.write("<center>");
            output.write(escapeHtml(result));
            output.write("</center>");
            output.write("</body> \r\n");
            output.write("</html> \r\n");
        }
    }

    /** Replaces the characters that are significant in HTML markup with entities. */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            switch (ch) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(ch);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Saves every image XObject found in the pages' resources as a PNG file.
     * Each file is named {@code imgSavePath + System.nanoTime() + ".png"}, so
     * {@code imgSavePath} acts as a path prefix.
     *
     * @param pdfFile     path of the PDF document
     * @param imgSavePath path prefix for the saved images
     * @throws IOException if the PDF cannot be read or an image cannot be written
     */
    public void getImage(String pdfFile, String imgSavePath) throws IOException {
        try (PDDocument document = PDDocument.load(new File(pdfFile))) {
            PDPageTree pages = document.getPages();
            for (PDPage page : pages) {
                PDResources resources = page.getResources();
                // A page without a resource dictionary has no images to export.
                if (resources == null) {
                    continue;
                }
                for (COSName name : resources.getXObjectNames()) {
                    PDXObject xObject = resources.getXObject(name);
                    if (xObject instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                        File target = new File(imgSavePath + System.nanoTime() + ".png");
                        ImageIO.write(
                                ((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) xObject).getImage(),
                                "png", target);
                    }
                }
            }
        }
    }

    public static void main(String[] args) {
        PDFApp pth = new PDFApp();
        String pdfFile = "/Users/hluo/data/homedepot.pdf";
        long startTime = System.currentTimeMillis();
        try {
            pth.getText(pdfFile, 1, Integer.MAX_VALUE, "/Users/hluo/data/homedepot.html");
            pth.getImage(pdfFile, "/Users/hluo/data/");
        } catch (Exception e) {
            e.printStackTrace();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("读写所用时间为:" + (endTime - startTime) + "ms " + pdfFile + "转换完成");
    }
}
References:
https://stackoverflow.com/questions/8370014/how-to-convert-pdf-to-html
https://blog.csdn.net/qq_34719291/article/details/75589024
https://blog.csdn.net/Warren_one/article/details/78625546
https://stackoverflow.com/questions/23813727/how-to-extract-text-from-a-pdf-file-with-apache-pdfbox