Java实现Word/Pdf/TXT转html
程序员文章站
2023-09-29 08:01:10
引言: 最近公司在做一个教育培训学习及在线考试的项目,本人主要从事网络课程模块,主要做课程分类,课程,课件的创建及在线学习和统计的功能,因为课件涉及到多种类型,像视频,音频,图文,外部链接及文档类型.其中就涉及到一个问题,就是文档型课件课程在网页上的展示和学习问题,因为要在线统计学习的课程,学习的人 ......
引言:
最近公司在做一个教育培训学习及在线考试的项目，本人主要负责网络课程模块，包括课程分类、课程、课件的创建，以及在线学习和统计功能。课件涉及多种类型，如视频、音频、图文、外部链接和文档。其中有一个问题：文档型课件如何在网页上展示和学习。因为系统需要在线统计学习的课程、学习的人员和学习的时长，所以不能像传统做法那样把文档下载到本地学习，否则就不受系统控制了。最终的方案是：在上传文档型课件时，将文件转换成对应的 HTML 文件，以便在网页上浏览学习。
下面主要针对 Word、PDF 和 TXT 文本文件的转换进行说明。
一:java实现将word转换为html
1:引入依赖
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
</dependency>
2:代码demo
1 package com.svse.controller; 2 3 import javax.xml.parsers.documentbuilderfactory; 4 import javax.xml.parsers.parserconfigurationexception; 5 import javax.xml.transform.outputkeys; 6 import javax.xml.transform.transformer; 7 import javax.xml.transform.transformerexception; 8 import javax.xml.transform.transformerfactory; 9 import javax.xml.transform.dom.domsource; 10 import javax.xml.transform.stream.streamresult; 11 12 import org.apache.poi.hwpf.hwpfdocument; 13 import org.apache.poi.hwpf.converter.picturesmanager; 14 import org.apache.poi.hwpf.converter.wordtohtmlconverter; 15 import org.apache.poi.hwpf.usermodel.picturetype; 16 import org.apache.poi.xwpf.converter.core.basicuriresolver; 17 import org.apache.poi.xwpf.converter.core.fileimageextractor; 18 import org.apache.poi.xwpf.converter.core.fileuriresolver; 19 import org.apache.poi.xwpf.converter.core.iuriresolver; 20 import org.apache.poi.xwpf.converter.core.ixwpfconverter; 21 import org.apache.poi.xwpf.converter.xhtml.xhtmlconverter; 22 import org.apache.poi.xwpf.converter.xhtml.xhtmloptions; 23 import org.apache.poi.xwpf.usermodel.xwpfdocument; 24 /** 25 * word 转换成html 26 */ 27 public class testwordtohtml { 28 29 public static final string storagepath="c://works//files//"; 30 public static final string ip="192.168.30.222"; 31 public static final string port="8010"; 32 public static void main(string[] args) throws ioexception, transformerexception, parserconfigurationexception { 33 testwordtohtml wt=new testwordtohtml(); 34 //wt.word2003tohtml("甲骨文考证.doc"); 35 wt.word2007tohtml("甲骨文考证.docx"); 36 37 } 38 39 /** 40 * 2003版本word转换成html 41 * @throws ioexception 42 * @throws transformerexception 43 * @throws parserconfigurationexception 44 */ 45 public void word2003tohtml(string filename) throws ioexception, transformerexception, parserconfigurationexception { 46 47 final string imagepath = storagepath+"fileimage/";//解析时候如果doc文件中有图片 图片会保存在此路径 48 final string strranstring=getrandomnum(); 49 string filepath 
=storagepath; 50 string htmlname =filename.substring(0, filename.indexof("."))+ "2003.html"; 51 final string file = filepath + filename; 52 inputstream input = new fileinputstream(new file(file)); 53 hwpfdocument worddocument = new hwpfdocument(input); 54 wordtohtmlconverter wordtohtmlconverter = new wordtohtmlconverter(documentbuilderfactory.newinstance().newdocumentbuilder().newdocument()); 55 //设置图片存放的位置 56 wordtohtmlconverter.setpicturesmanager(new picturesmanager() { 57 public string savepicture(byte[] content, picturetype picturetype, string suggestedname, float widthinches, float heightinches) { 58 file imgpath = new file(imagepath); 59 if(!imgpath.exists()){//图片目录不存在则创建 60 imgpath.mkdirs(); 61 } 62 63 file file = new file(imagepath +strranstring+suggestedname); 64 try { 65 outputstream os = new fileoutputstream(file); 66 os.write(content); 67 os.close(); 68 } catch (filenotfoundexception e) { 69 e.printstacktrace(); 70 } catch (ioexception e) { 71 e.printstacktrace(); 72 } 73 74 return "http://"+ip+":"+port+"//uploadfile/fileimage/"+strranstring+suggestedname; 75 // return imagepath +strranstring+suggestedname; 76 } 77 }); 78 79 //解析word文档 80 wordtohtmlconverter.processdocument(worddocument); 81 document htmldocument = wordtohtmlconverter.getdocument(); 82 83 file htmlfile = new file(filepath +strranstring+htmlname); 84 outputstream outstream = new fileoutputstream(htmlfile); 85 86 87 domsource domsource = new domsource(htmldocument); 88 streamresult streamresult = new streamresult(outstream); 89 90 transformerfactory factory = transformerfactory.newinstance(); 91 transformer serializer = factory.newtransformer(); 92 serializer.setoutputproperty(outputkeys.encoding, "utf-8"); 93 serializer.setoutputproperty(outputkeys.indent, "yes"); 94 serializer.setoutputproperty(outputkeys.method, "html"); 95 96 serializer.transform(domsource, streamresult); 97 outstream.close(); 98 99 system.out.println("生成html文件路径:"+ 
"http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname); 100 } 101 102 /** 103 * 2007版本word转换成html 104 * @throws ioexception 105 */ 106 public void word2007tohtml(string filename) throws ioexception { 107 108 final string strranstring=getrandomnum(); 109 110 string filepath = storagepath+strranstring; 111 string htmlname =filename.substring(0, filename.indexof("."))+ "2007.html"; 112 file f = new file(storagepath+filename); 113 if (!f.exists()) { 114 system.out.println("sorry file does not exists!"); 115 } else { 116 if (f.getname().endswith(".docx") || f.getname().endswith(".docx")) { 117 try { 118 // 1) 加载word文档生成 xwpfdocument对象 119 inputstream in = new fileinputstream(f); 120 xwpfdocument document = new xwpfdocument(in); 121 122 // 2) 解析 xhtml配置 (这里设置iuriresolver来设置图片存放的目录) 123 file imagefolderfile = new file(filepath); 124 xhtmloptions options = xhtmloptions.create().uriresolver(new fileuriresolver(imagefolderfile)); 125 options.setextractor(new fileimageextractor(imagefolderfile)); 126 options.uriresolver(new iuriresolver() { 127 public string resolve(string uri) { 128 //http://192.168.30.222:8010//uploadfile/.... 
129 return "http://"+ip+":"+port+"//uploadfile/"+strranstring +"/"+ uri; 130 } 131 }); 132 133 options.setignorestylesifunused(false); 134 options.setfragment(true); 135 136 // 3) 将 xwpfdocument转换成xhtml 137 outputstream out = new fileoutputstream(new file(filepath + htmlname)); 138 ixwpfconverter<xhtmloptions> converter = xhtmlconverter.getinstance(); 139 converter.convert(document,out, options); 140 //xhtmlconverter.getinstance().convert(document, out, options); 141 system.out.println("html路径:"+"http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname); 142 } catch (exception e) { 143 e.printstacktrace(); 144 } 145 146 } else { 147 system.out.println("enter only ms office 2007+ files"); 148 } 149 } 150 } 151 152 /** 153 *功能说明:生成时间戳 154 *创建人:zsq 155 *创建时间:2019年12月7日 下午2:37:09 156 * 157 */ 158 public static string getrandomnum(){ 159 date dt = new date(); 160 simpledateformat sdf = new simpledateformat("yyyymmddhhmmss"); 161 string str=sdf.format(dt); 162 return str; 163 } 164 165 }
二:java实现将pdf转换为html
1: 引入依赖
<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>
2:代码demo
1 public class pdftohtml { 2 3 /* 4 pdf转换html 5 */ 6 public void pdftohtmltest(string inpdfpath,string outputhtmlpath) { 7 // string outputpath = "c:\\works\\files\\zsq保密知识测试题库.html"; 8 9 //try() 写在()里面会自动关闭流 10 try{ 11 bufferedwriter out = new bufferedwriter(new outputstreamwriter(new fileoutputstream(new file(outputhtmlpath)),"utf-8")); 12 //加载pdf文档 13 //pddocument document = pddocument.load(bytes); 14 pddocument document = pddocument.load(new file(inpdfpath)); 15 pdfdomtree pdfdomtree = new pdfdomtree(); 16 pdfdomtree.writetext(document,out); 17 } catch (exception e) { 18 e.printstacktrace(); 19 } 20 } 21 22 public static void main(string[] args) throws ioexception { 23 pdftohtml ph=new pdftohtml(); 24 string pdfpath="c:\\works\\files\\武研中心行政考勤制度.pdf"; 25 string outputpath="c:\\works\\files\\武研中心行政考勤制度.html"; 26 ph.pdftohtmltest(pdfpath,outputpath); 27 } 28 29 }
三:java实现将txt转换为html
1 /* 2 * txt文档转html 3 filepath:txt原文件路径 4 htmlposition:转化后生成的html路径 5 */ 6 public static void txttohtml(string filepath, string htmlposition) { 7 try { 8 //string encoding = "gbk"; 9 file file = new file(filepath); 10 if (file.isfile() && file.exists()) { // 判断文件是否存在 11 inputstreamreader read = new inputstreamreader(new fileinputstream(file), "gbk"); 12 // 考虑到编码格式 13 bufferedreader bufferedreader = new bufferedreader(read); 14 // 写文件 15 fileoutputstream fos = new fileoutputstream(new file(htmlposition)); 16 outputstreamwriter osw = new outputstreamwriter(fos, "gbk"); 17 bufferedwriter bw = new bufferedwriter(osw); 18 string linetxt = null; 19 while ((linetxt = bufferedreader.readline()) != null) { 20 bw.write("   "+linetxt + "</br>"); 21 } 22 bw.close(); 23 osw.close(); 24 fos.close(); 25 read.close(); 26 } else { 27 system.out.println("找不到指定的文件"); 28 } 29 } catch (exception e) { 30 system.out.println("读取文件内容出错"); 31 e.printstacktrace(); 32 } 33 }
上一篇: java 虚拟机原理
推荐阅读