Java实现Word/Pdf/TXT转html

程序员文章站 2023-09-29 08:01:10

引言: 最近公司在做一个教育培训学习及在线考试的项目,本人主要从事网络课程模块,主要做课程分类,课程,课件的创建及在线学习和统计的功能,因为课件涉及到多种类型,像视频,音频,图文,外部链接及文档类型.其中就涉及到一个问题,就是文档型课件课程在网页上的展示和学习问题,因为要在线统计学习的课程,学习的人 ......

引言:

最近公司在做一个教育培训学习及在线考试的项目，本人主要负责网络课程模块，包括课程分类、课程、课件的创建，以及在线学习和统计功能。课件涉及多种类型，如视频、音频、图文、外部链接及文档类型。其中就涉及到一个问题：文档型课件如何在网页上展示和学习。因为要在线统计学习的课程、学习的人员和学习的时长，所以不能像传统做法那样将文档下载到本地学习，那样就不受系统控制了。最终的方案是：在上传文档型课件的时候，将文件转换成对应的html文件，以便在网页上浏览学习。

下边主要针对word,pdf和txt文本文件进行转换

一:java实现将word转换为html

1:引入依赖

<dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>fr.opensagres.xdocreport.document</artifactId>
  <version>1.0.5</version>
</dependency>
<dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
  <version>1.0.5</version>
</dependency>
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi</artifactId>
  <version>3.12</version>
</dependency>
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-scratchpad</artifactId>
  <version>3.12</version>
</dependency>

2:代码demo

  1 package com.svse.controller;
  2 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
 24 /**
 25  * word 转换成html
 26  */
 27 public class testwordtohtml {
 28 
 29     public static  final string storagepath="c://works//files//";
 30     public static  final string ip="192.168.30.222";
 31     public static  final string port="8010";
 32     public static void main(string[] args) throws ioexception, transformerexception, parserconfigurationexception {
 33         testwordtohtml wt=new testwordtohtml();
 34         //wt.word2003tohtml("甲骨文考证.doc");
 35         wt.word2007tohtml("甲骨文考证.docx");
 36 
 37     }
 38       
 39      /**
 40      * 2003版本word转换成html
 41      * @throws ioexception
 42      * @throws transformerexception
 43      * @throws parserconfigurationexception
 44      */
 45     public void word2003tohtml(string filename) throws ioexception, transformerexception, parserconfigurationexception {
 46        
 47         final string imagepath = storagepath+"fileimage/";//解析时候如果doc文件中有图片  图片会保存在此路径
 48         final string strranstring=getrandomnum();
 49         string filepath =storagepath;
 50         string htmlname =filename.substring(0, filename.indexof("."))+ "2003.html";
 51         final string file = filepath + filename;
 52         inputstream input = new fileinputstream(new file(file));
 53         hwpfdocument worddocument = new hwpfdocument(input);
 54         wordtohtmlconverter wordtohtmlconverter = new wordtohtmlconverter(documentbuilderfactory.newinstance().newdocumentbuilder().newdocument());
 55         //设置图片存放的位置
 56         wordtohtmlconverter.setpicturesmanager(new picturesmanager() {
 57             public string savepicture(byte[] content, picturetype picturetype, string suggestedname, float widthinches, float heightinches) {
 58                 file imgpath = new file(imagepath);
 59                 if(!imgpath.exists()){//图片目录不存在则创建
 60                     imgpath.mkdirs();
 61                 }
 62                 
 63                 file file = new file(imagepath +strranstring+suggestedname);
 64                 try {
 65                     outputstream os = new fileoutputstream(file);
 66                     os.write(content);
 67                     os.close();
 68                 } catch (filenotfoundexception e) {
 69                     e.printstacktrace();
 70                 } catch (ioexception e) {
 71                     e.printstacktrace();
 72                 }
 73                 
 74                 return  "http://"+ip+":"+port+"//uploadfile/fileimage/"+strranstring+suggestedname;
 75                // return imagepath +strranstring+suggestedname;
 76             }
 77         });
 78         
 79         //解析word文档
 80         wordtohtmlconverter.processdocument(worddocument);
 81         document htmldocument = wordtohtmlconverter.getdocument();
 82         
 83         file htmlfile = new file(filepath +strranstring+htmlname);
 84         outputstream outstream = new fileoutputstream(htmlfile);
 85         
 86 
 87         domsource domsource = new domsource(htmldocument);
 88         streamresult streamresult = new streamresult(outstream);
 89 
 90         transformerfactory factory = transformerfactory.newinstance();
 91         transformer serializer = factory.newtransformer();
 92         serializer.setoutputproperty(outputkeys.encoding, "utf-8");
 93         serializer.setoutputproperty(outputkeys.indent, "yes");
 94         serializer.setoutputproperty(outputkeys.method, "html");
 95         
 96         serializer.transform(domsource, streamresult);
 97         outstream.close();
 98         
 99         system.out.println("生成html文件路径:"+ "http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname);
100     }
101 
102     /**
103      * 2007版本word转换成html
104      * @throws ioexception
105      */
106     public void word2007tohtml(string filename) throws ioexception {
107         
108        final string strranstring=getrandomnum();
109         
110         string filepath = storagepath+strranstring;
111         string htmlname =filename.substring(0, filename.indexof("."))+ "2007.html";
112         file f = new file(storagepath+filename);  
113         if (!f.exists()) {  
114             system.out.println("sorry file does not exists!");  
115         } else {  
116             if (f.getname().endswith(".docx") || f.getname().endswith(".docx")) {  
117                 try {
118                     // 1) 加载word文档生成 xwpfdocument对象  
119                     inputstream in = new fileinputstream(f);  
120                     xwpfdocument document = new xwpfdocument(in);  
121       
122                     // 2) 解析 xhtml配置 (这里设置iuriresolver来设置图片存放的目录)  
123                     file imagefolderfile = new file(filepath);  
124                     xhtmloptions options = xhtmloptions.create().uriresolver(new fileuriresolver(imagefolderfile));  
125                     options.setextractor(new fileimageextractor(imagefolderfile));  
126                     options.uriresolver(new iuriresolver() {
127                         public string resolve(string uri) {
128                             //http://192.168.30.222:8010//uploadfile/....
129                             return "http://"+ip+":"+port+"//uploadfile/"+strranstring +"/"+ uri;
130                         }
131                     });
132                     
133                     options.setignorestylesifunused(false);  
134                     options.setfragment(true);  
135                       
136                     // 3) 将 xwpfdocument转换成xhtml  
137                     outputstream out = new fileoutputstream(new file(filepath + htmlname));  
138                     ixwpfconverter<xhtmloptions> converter = xhtmlconverter.getinstance();
139                     converter.convert(document,out, options);
140                     //xhtmlconverter.getinstance().convert(document, out, options);  
141                     system.out.println("html路径:"+"http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname);
142                 } catch (exception e) {
143                     e.printstacktrace();
144                 }
145             
146             } else {  
147                 system.out.println("enter only ms office 2007+ files");  
148             }  
149         }  
150     }  
151 
152      /**
153      *功能说明:生成时间戳
154      *创建人:zsq
155      *创建时间:2019年12月7日 下午2:37:09
156      *
157      */
158      public static string getrandomnum(){
159          date dt = new date();
160          simpledateformat sdf = new simpledateformat("yyyymmddhhmmss");  
161          string str=sdf.format(dt);
162          return str;
163      }
164      
165    }

二:java实现将pdf转换为html

1: 引入依赖

<dependency>
  <groupId>net.sf.cssbox</groupId>
  <artifactId>pdf2dom</artifactId>
  <version>1.7</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox</artifactId>
  <version>2.0.12</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox-tools</artifactId>
  <version>2.0.12</version>
</dependency>

2:代码demo

 1 public class pdftohtml {
 2 
 3   /*
 4     pdf转换html
 5      */
 6     public void pdftohtmltest(string inpdfpath,string outputhtmlpath)  {
 7        // string outputpath = "c:\\works\\files\\zsq保密知识测试题库.html";
 8     9        //try() 写在()里面会自动关闭流
10         try{
11             bufferedwriter out = new bufferedwriter(new outputstreamwriter(new fileoutputstream(new file(outputhtmlpath)),"utf-8"));
12             //加载pdf文档
13             //pddocument document = pddocument.load(bytes);
14             pddocument document = pddocument.load(new file(inpdfpath));
15             pdfdomtree pdfdomtree = new pdfdomtree();
16             pdfdomtree.writetext(document,out);
17         } catch (exception e) {
18             e.printstacktrace();
19         }
20     }
21 
22     public static void main(string[] args) throws ioexception {
23         pdftohtml ph=new pdftohtml();
24         string pdfpath="c:\\works\\files\\武研中心行政考勤制度.pdf";
25         string outputpath="c:\\works\\files\\武研中心行政考勤制度.html";
26         ph.pdftohtmltest(pdfpath,outputpath);
27   }
28 
29 }

三:java实现将txt转换为html

 1  /*
 2      * txt文档转html
 3        filepath:txt原文件路径
 4        htmlposition:转化后生成的html路径
 5     */
 6     public static void txttohtml(string filepath, string htmlposition) {
 7         try {
 8             //string encoding = "gbk";
 9             file file = new file(filepath);
10             if (file.isfile() && file.exists()) { // 判断文件是否存在
11                 inputstreamreader read = new inputstreamreader(new fileinputstream(file), "gbk");
12                 // 考虑到编码格式
13                 bufferedreader bufferedreader = new bufferedreader(read);
14                 // 写文件
15                 fileoutputstream fos = new fileoutputstream(new file(htmlposition));
16                 outputstreamwriter osw = new outputstreamwriter(fos, "gbk");
17                 bufferedwriter bw = new bufferedwriter(osw);
18                 string linetxt = null;
19                 while ((linetxt = bufferedreader.readline()) != null) {
20                     bw.write("&nbsp&nbsp&nbsp"+linetxt + "</br>");
21                 }
22                 bw.close();
23                 osw.close();
24                 fos.close();
25                 read.close();
26             } else {
27                 system.out.println("找不到指定的文件");
28             }
29         } catch (exception e) {
30             system.out.println("读取文件内容出错");
31             e.printstacktrace();
32         }
33     }

上一篇： java 虚拟机原理

下一篇： Python自动群发邮件，只需20行代码！

Java实现Word/Pdf/TXT转html

纯js实现html转pdf的简单实例(推荐)

纯js实现html转pdf的简单实例(推荐)

js实现html转img、pdf的方法（代码教程）

php实现word转html的方法

Python实现批量将word转html并将html内容发布至网站的方法

C#实现HTML转WORD及WORD转PDF的方法

python实现pdf转换成word/txt纯文本文件

Java实现Word/Pdf/TXT转html

Asp.net实现直接在浏览器预览Word、Excel、PDF、Txt文件（附源码）

php用windows COM组件调用openoffice接口实现word转pdf文件时报错的解决办法