itext,jxl实现pdf转为txt,txt转excel
程序员文章站
2022-04-11 10:21:28
...
itext,jxl实现pdf转为txt,txt转excel
pom.xml配置
<!-- Version numbers of the dependencies, kept in one place and referenced below as ${...} -->
<properties>
<com.itextpdf.version>5.3.2</com.itextpdf.version>
<org.bouncycastle.version>1.52</org.bouncycastle.version>
<jxl.version>1.0</jxl.version>
</properties>
<!-- Dependencies -->
<dependencies>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>${com.itextpdf.version}</version>
<scope>compile</scope>
</dependency>
<!-- Bouncy Castle OpenPGP artifact. The original comment here said "read pdf". NOTE(review): the Java code below never imports org.bouncycastle directly; presumably this is meant as crypto support for iText when reading PDFs, confirm that it is needed. -->
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcpg-jdk15on</artifactId>
<version>${org.bouncycastle.version}</version>
</dependency>
<!-- jxl: supplies the jxl.* classes that TextToExcel imports to write the .xls workbook. NOTE(review): verify that these coordinates and version resolve in your repository. -->
<dependency>
<groupId>jxl</groupId>
<artifactId>jxl</artifactId>
<version>${jxl.version}</version>
</dependency>
</dependencies>
itext 读取pdf->txt
package itext;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.RegionTextRenderFilter;
import com.itextpdf.text.pdf.parser.RenderFilter;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
/**
 * Extracts the text of a PDF document with iText and writes it to a plain-text file.
 *
 * <p>The text file is written as UTF-8 because {@code TextToExcel} reads it back with
 * that charset; relying on the platform default corrupts non-ASCII text on systems
 * whose default encoding is not UTF-8.
 */
public class ReadPdfByiText {

    /** Charset of the generated text file. */
    private static final String OUTPUT_CHARSET = "UTF-8";

    /**
     * Converts the hard-coded sample PDF into the hard-coded text file.
     *
     * @param args unused
     * @throws IOException if the output file cannot be created
     */
    public static void main(String[] args) throws IOException {
        String outputPath = "D:\\developcodespace\\PdfContent_1.txt";
        String fileName = "D:\\developcodespace\\20190323175137823782.pdf";
        PrintWriter writer = new PrintWriter(outputPath, OUTPUT_CHARSET);
        // Reads every page in full. To extract only a fixed region of each page,
        // call readPdf_filter(writer, fileName) instead.
        readPdf(writer, fileName);
    }

    /**
     * Writes the complete text of every page of the PDF to {@code writer}.
     *
     * <p>Failures are reported on stderr and nothing is written in that case.
     * The writer is always closed before this method returns.
     *
     * @param writer   destination of the extracted text; closed by this method
     * @param fileName path of the PDF file to read
     */
    public static void readPdf(PrintWriter writer, String fileName) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(fileName);
            writer.write(extractText(reader, null));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
            writer.close();
        }
    }

    /**
     * Writes the text found inside a fixed rectangle of every page to {@code writer}.
     *
     * <p>The rectangle is {@code Rectangle(90, 0, 450, 40)} in PDF user-space units.
     * Failures are reported on stderr and nothing is written in that case.
     * The writer is always closed before this method returns.
     *
     * @param writer   destination of the extracted text; closed by this method
     * @param fileName path of the PDF file to read
     */
    public static void readPdf_filter(PrintWriter writer, String fileName) {
        PdfReader reader = null;
        try {
            RenderFilter filter = new RegionTextRenderFilter(new Rectangle(90, 0, 450, 40));
            reader = new PdfReader(fileName);
            writer.write(extractText(reader, filter));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
            writer.close();
        }
    }

    /**
     * Collects the text of all pages, optionally restricted by a render filter.
     *
     * @param reader open PDF reader
     * @param filter region filter to apply, or {@code null} to take the whole page
     * @return the page texts joined by a line break
     * @throws IOException if a page cannot be parsed
     */
    private static String extractText(PdfReader reader, RenderFilter filter) throws IOException {
        StringBuilder text = new StringBuilder();
        int pageCount = reader.getNumberOfPages();
        for (int page = 1; page <= pageCount; page++) {
            if (page > 1) {
                // The text returned for a page does not end with a line break, so
                // without a separator the last line of one page would be fused with
                // the first line of the next and break line-based consumers.
                text.append('\n');
            }
            if (filter == null) {
                text.append(PdfTextExtractor.getTextFromPage(reader, page));
            } else {
                // A fresh strategy per page: the strategy accumulates the text it renders.
                TextExtractionStrategy strategy =
                        new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                text.append(PdfTextExtractor.getTextFromPage(reader, page, strategy));
            }
        }
        return text.toString();
    }
}
jxl读取txt->excel
package itext;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
/**
 * Converts a whitespace-separated text file into an Excel workbook using jxl.
 *
 * <p>Row 0 of the sheet holds fixed column titles; every input line that starts with
 * {@code "014"} becomes one further row, with one cell per non-blank token.
 */
public class TextToExcel {

    /** Column titles written to the first row of the sheet. */
    private static final String[] HEADERS = {"公司名称", "岗位", "薪资", "状态"};

    /**
     * Reads the hard-coded text file and writes the hard-coded {@code .xls} file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("D:\\developcodespace\\PdfContent_1.txt"); // text file to read
        File file2 = new File("D:\\developcodespace\\work.xls"); // workbook to generate
        if (file.exists() && file.isFile()) {
            BufferedReader input = null;
            WritableWorkbook wbook = null;
            try {
                input = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
                wbook = Workbook.createWorkbook(file2);
                WritableSheet sheet = wbook.createSheet("first", 0);
                writeHeader(sheet);
                writeRows(input, sheet);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (WriteException e) {
                e.printStackTrace();
            } finally {
                // Each resource is released independently and only if it was actually
                // opened, so an early failure is not masked by a NullPointerException
                // and a failing workbook write cannot leak the reader.
                finishWorkbook(wbook);
                closeReader(input);
            }
            System.out.println("over!");
            System.exit(0);
        } else {
            System.out.println("file is not exists or not a file");
            System.exit(0);
        }
    }

    /** Writes the column titles into row 0 of the sheet. */
    private static void writeHeader(WritableSheet sheet) throws WriteException {
        for (int column = 0; column < HEADERS.length; column++) {
            sheet.addCell(new Label(column, 0, HEADERS[column]));
        }
    }

    /** Copies every data line of the input into the sheet, starting at row 1. */
    private static void writeRows(BufferedReader input, WritableSheet sheet)
            throws IOException, WriteException {
        int row = 1;
        String line;
        while ((line = input.readLine()) != null) {
            // Only lines starting with "014" are treated as data rows.
            // NOTE(review): presumably a record-number prefix of the sample PDF - confirm.
            if (!line.startsWith("014")) {
                continue;
            }
            int column = 0;
            String[] words = line.split("[ \t]"); // tokens are separated by a space or a tab
            for (int i = 0; i < words.length; i++) {
                String cell = words[i].trim();
                // Consecutive separators produce empty tokens; they must not take up a column.
                if (cell.length() > 0) {
                    sheet.addCell(new Label(column, row, cell));
                    column++;
                }
            }
            row++;
        }
    }

    /** Flushes the workbook to disk and closes it; does nothing if it was never created. */
    private static void finishWorkbook(WritableWorkbook wbook) {
        if (wbook == null) {
            return;
        }
        try {
            try {
                wbook.write();
            } finally {
                wbook.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (WriteException e) {
            e.printStackTrace();
        }
    }

    /** Closes the reader (and the streams it wraps); does nothing if it was never opened. */
    private static void closeReader(BufferedReader input) {
        if (input == null) {
            return;
        }
        try {
            input.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
上一篇: 小白不要买这两类电脑:老板不说 谨防上当
推荐阅读
-
Java实现Word/Pdf/TXT转html
-
Asp.net实现直接在浏览器预览Word、Excel、PDF、Txt文件(附源码)
-
Java实现Word/Excel/TXT转PDF
-
C# 基于NPOI+Office COM组件 实现20行代码在线预览文档(word,excel,pdf,txt,png)
-
Java实现Word/Pdf/TXT转html的示例
-
Python实现pdf文档转txt的方法示例
-
php可否实现服务器端txt文件转epub文件,word转epub文件,pdf转epub文件
-
Java实现Word/Pdf/TXT转html
-
Asp.net实现直接在浏览器预览Word、Excel、PDF、Txt文件(附源码)
-
Java实现Word/Excel/TXT转PDF