欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

Java实现Word/Pdf/TXT转html

程序员文章站 2023-09-29 08:01:10
引言: 最近公司在做一个教育培训学习及在线考试的项目,本人主要从事网络课程模块,主要做课程分类,课程,课件的创建及在线学习和统计的功能,因为课件涉及到多种类型,像视频,音频,图文,外部链接及文档类型.其中就涉及到一个问题,就是文档型课件课程在网页上的展示和学习问题,因为要在线统计学习的课程,学习的人 ......

引言:

    最近公司在做一个教育培训学习及在线考试的项目，本人主要负责网络课程模块，包括课程分类、课程、课件的创建，以及在线学习和统计功能。课件涉及多种类型，如视频、音频、图文、外部链接及文档类型。其中就涉及到一个问题：文档型课件如何在网页上展示和学习。因为系统要在线统计学习的课程、学习的人员和学习的时长，所以不能像传统做法那样将文档下载到本地学习——那样就不受系统控制了。最终的方案是：在上传文档型课件的时候，将文件转换成对应的 HTML 文件，以便在网页上浏览学习。

 下面主要针对 Word、PDF 和 TXT 文本文件的转换进行说明。

一:java实现将word转换为html

   1:引入依赖

 1 <dependency>
 2   <groupId>fr.opensagres.xdocreport</groupId>
 3   <artifactId>fr.opensagres.xdocreport.document</artifactId>
 4   <version>1.0.5</version>
 5 </dependency>
 6 <dependency>
 7   <groupId>fr.opensagres.xdocreport</groupId>
 8   <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
 9   <version>1.0.5</version>
10 </dependency>
11 <dependency>
12   <groupId>org.apache.poi</groupId>
13   <artifactId>poi</artifactId>
14   <version>3.12</version>
15 </dependency>
16 <dependency>
17   <groupId>org.apache.poi</groupId>
18   <artifactId>poi-scratchpad</artifactId>
19   <version>3.12</version>
20 </dependency>

  2:代码demo

 

package com.svse.controller;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
 24 /**
 25  * word 转换成html
 26  */
 27 public class testwordtohtml {
 28 
 29     public static  final string storagepath="c://works//files//";
 30     public static  final string ip="192.168.30.222";
 31     public static  final string port="8010";
 32     public static void main(string[] args) throws ioexception, transformerexception, parserconfigurationexception {
 33         testwordtohtml wt=new testwordtohtml();
 34         //wt.word2003tohtml("甲骨文考证.doc");
 35         wt.word2007tohtml("甲骨文考证.docx");
 36 
 37     }
 38       
 39      /**
 40      * 2003版本word转换成html
 41      * @throws ioexception
 42      * @throws transformerexception
 43      * @throws parserconfigurationexception
 44      */
 45     public void word2003tohtml(string filename) throws ioexception, transformerexception, parserconfigurationexception {
 46        
 47         final string imagepath = storagepath+"fileimage/";//解析时候如果doc文件中有图片  图片会保存在此路径
 48         final string strranstring=getrandomnum();
 49         string filepath =storagepath;
 50         string htmlname =filename.substring(0, filename.indexof("."))+ "2003.html";
 51         final string file = filepath + filename;
 52         inputstream input = new fileinputstream(new file(file));
 53         hwpfdocument worddocument = new hwpfdocument(input);
 54         wordtohtmlconverter wordtohtmlconverter = new wordtohtmlconverter(documentbuilderfactory.newinstance().newdocumentbuilder().newdocument());
 55         //设置图片存放的位置
 56         wordtohtmlconverter.setpicturesmanager(new picturesmanager() {
 57             public string savepicture(byte[] content, picturetype picturetype, string suggestedname, float widthinches, float heightinches) {
 58                 file imgpath = new file(imagepath);
 59                 if(!imgpath.exists()){//图片目录不存在则创建
 60                     imgpath.mkdirs();
 61                 }
 62                 
 63                 file file = new file(imagepath +strranstring+suggestedname);
 64                 try {
 65                     outputstream os = new fileoutputstream(file);
 66                     os.write(content);
 67                     os.close();
 68                 } catch (filenotfoundexception e) {
 69                     e.printstacktrace();
 70                 } catch (ioexception e) {
 71                     e.printstacktrace();
 72                 }
 73                 
 74                 return  "http://"+ip+":"+port+"//uploadfile/fileimage/"+strranstring+suggestedname;
 75                // return imagepath +strranstring+suggestedname;
 76             }
 77         });
 78         
 79         //解析word文档
 80         wordtohtmlconverter.processdocument(worddocument);
 81         document htmldocument = wordtohtmlconverter.getdocument();
 82         
 83         file htmlfile = new file(filepath +strranstring+htmlname);
 84         outputstream outstream = new fileoutputstream(htmlfile);
 85         
 86 
 87         domsource domsource = new domsource(htmldocument);
 88         streamresult streamresult = new streamresult(outstream);
 89 
 90         transformerfactory factory = transformerfactory.newinstance();
 91         transformer serializer = factory.newtransformer();
 92         serializer.setoutputproperty(outputkeys.encoding, "utf-8");
 93         serializer.setoutputproperty(outputkeys.indent, "yes");
 94         serializer.setoutputproperty(outputkeys.method, "html");
 95         
 96         serializer.transform(domsource, streamresult);
 97         outstream.close();
 98         
 99         system.out.println("生成html文件路径:"+ "http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname);
100     }
101 
102     /**
103      * 2007版本word转换成html
104      * @throws ioexception
105      */
106     public void word2007tohtml(string filename) throws ioexception {
107         
108        final string strranstring=getrandomnum();
109         
110         string filepath = storagepath+strranstring;
111         string htmlname =filename.substring(0, filename.indexof("."))+ "2007.html";
112         file f = new file(storagepath+filename);  
113         if (!f.exists()) {  
114             system.out.println("sorry file does not exists!");  
115         } else {  
116             if (f.getname().endswith(".docx") || f.getname().endswith(".docx")) {  
117                 try {
118                     // 1) 加载word文档生成 xwpfdocument对象  
119                     inputstream in = new fileinputstream(f);  
120                     xwpfdocument document = new xwpfdocument(in);  
121       
122                     // 2) 解析 xhtml配置 (这里设置iuriresolver来设置图片存放的目录)  
123                     file imagefolderfile = new file(filepath);  
124                     xhtmloptions options = xhtmloptions.create().uriresolver(new fileuriresolver(imagefolderfile));  
125                     options.setextractor(new fileimageextractor(imagefolderfile));  
126                     options.uriresolver(new iuriresolver() {
127                         public string resolve(string uri) {
128                             //http://192.168.30.222:8010//uploadfile/....
129                             return "http://"+ip+":"+port+"//uploadfile/"+strranstring +"/"+ uri;
130                         }
131                     });
132                     
133                     options.setignorestylesifunused(false);  
134                     options.setfragment(true);  
135                       
136                     // 3) 将 xwpfdocument转换成xhtml  
137                     outputstream out = new fileoutputstream(new file(filepath + htmlname));  
138                     ixwpfconverter<xhtmloptions> converter = xhtmlconverter.getinstance();
139                     converter.convert(document,out, options);
140                     //xhtmlconverter.getinstance().convert(document, out, options);  
141                     system.out.println("html路径:"+"http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname);
142                 } catch (exception e) {
143                     e.printstacktrace();
144                 }
145             
146             } else {  
147                 system.out.println("enter only ms office 2007+ files");  
148             }  
149         }  
150     }  
151 
152      /**
153      *功能说明:生成时间戳
154      *创建人:zsq
155      *创建时间:2019年12月7日 下午2:37:09
156      *
157      */
158      public static string getrandomnum(){
159          date dt = new date();
160          simpledateformat sdf = new simpledateformat("yyyymmddhhmmss");  
161          string str=sdf.format(dt);
162          return str;
163      }
164      
165    } 

二:java实现将pdf转换为html

  1: 引入依赖

 1 <dependency>
 2             <groupId>net.sf.cssbox</groupId>
 3             <artifactId>pdf2dom</artifactId>
 4             <version>1.7</version>
 5         </dependency>
 6         <dependency>
 7             <groupId>org.apache.pdfbox</groupId>
 8             <artifactId>pdfbox</artifactId>
 9             <version>2.0.12</version>
10         </dependency>
11         <dependency>
12             <groupId>org.apache.pdfbox</groupId>
13             <artifactId>pdfbox-tools</artifactId>
14             <version>2.0.12</version>
15  </dependency>
16         

2:代码demo

 1 public class pdftohtml {
 2 
 3   /*
 4     pdf转换html
 5      */
 6     public void pdftohtmltest(string inpdfpath,string outputhtmlpath)  {
 7        // string outputpath = "c:\\works\\files\\zsq保密知识测试题库.html";
 8     9        //try() 写在()里面会自动关闭流
10         try{
11             bufferedwriter out = new bufferedwriter(new outputstreamwriter(new fileoutputstream(new file(outputhtmlpath)),"utf-8"));
12             //加载pdf文档
13             //pddocument document = pddocument.load(bytes);
14             pddocument document = pddocument.load(new file(inpdfpath));
15             pdfdomtree pdfdomtree = new pdfdomtree();
16             pdfdomtree.writetext(document,out);
17         } catch (exception e) {
18             e.printstacktrace();
19         }
20     }
21 
22     public static void main(string[] args) throws ioexception {
23         pdftohtml ph=new pdftohtml();
24         string pdfpath="c:\\works\\files\\武研中心行政考勤制度.pdf";
25         string outputpath="c:\\works\\files\\武研中心行政考勤制度.html";
26         ph.pdftohtmltest(pdfpath,outputpath);
27   }
28 
29 }

三:java实现将txt转换为html

 1  /*
 2      * txt文档转html
 3        filepath:txt原文件路径
 4        htmlposition:转化后生成的html路径
 5     */
 6     public static void txttohtml(string filepath, string htmlposition) {
 7         try {
 8             //string encoding = "gbk";
 9             file file = new file(filepath);
10             if (file.isfile() && file.exists()) { // 判断文件是否存在
11                 inputstreamreader read = new inputstreamreader(new fileinputstream(file), "gbk");
12                 // 考虑到编码格式
13                 bufferedreader bufferedreader = new bufferedreader(read);
14                 // 写文件
15                 fileoutputstream fos = new fileoutputstream(new file(htmlposition));
16                 outputstreamwriter osw = new outputstreamwriter(fos, "gbk");
17                 bufferedwriter bw = new bufferedwriter(osw);
18                 string linetxt = null;
19                 while ((linetxt = bufferedreader.readline()) != null) {
20                     bw.write("&nbsp&nbsp&nbsp"+linetxt + "</br>");
21                 }
22                 bw.close();
23                 osw.close();
24                 fos.close();
25                 read.close();
26             } else {
27                 system.out.println("找不到指定的文件");
28             }
29         } catch (exception e) {
30             system.out.println("读取文件内容出错");
31             e.printstacktrace();
32         }
33     }