Java实现Word/Pdf/TXT转html的示例

程序员文章站 2022-03-28 17:42:14

引言: 最近公司在做一个教育培训学习及在线考试的项目,本人主要从事网络课程模块,主要做课程分类,课程,课件的创建及在线学习和统计的功能,因为课件涉及到多种类型,像视频,音频,图文,外部链接及文档类型...

引言:

最近公司在做一个教育培训学习及在线考试的项目,本人主要从事网络课程模块,主要做课程分类,课程,课件的创建及在线学习和统计的功能。因为课件涉及到多种类型,像视频,音频,图文,外部链接及文档类型,其中就涉及到一个问题,就是文档型课件课程在网页上的展示和学习问题。因为要在线统计学习的课程,学习的人员,学习的时长,所以不能像传统做法将文档下载到本地学习,那样就不受系统控制了。所以最终的方案是,在上传文档型课件的时候,将文件对应地转换成html文件,以便在网页上能够浏览学习。

下边主要针对word,pdf和txt文本文件进行转换

一:java实现将word转换为html

1:引入依赖

<dependency>
 <groupId>fr.opensagres.xdocreport</groupId>
 <artifactId>fr.opensagres.xdocreport.document</artifactId>
 <version>1.0.5</version>
</dependency>
<dependency>
 <groupId>fr.opensagres.xdocreport</groupId>
 <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
 <version>1.0.5</version>
</dependency>
<dependency>
 <groupId>org.apache.poi</groupId>
 <artifactId>poi</artifactId>
 <version>3.12</version>
</dependency>
<dependency>
 <groupId>org.apache.poi</groupId>
 <artifactId>poi-scratchpad</artifactId>
 <version>3.12</version>
</dependency>

2:代码demo

package com.svse.controller;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Locale;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
/**
 * word 转换成html
 */
public class testwordtohtml {

 public static final string storagepath="c://works//files//";
 public static final string ip="192.168.30.222";
 public static final string port="8010";
 public static void main(string[] args) throws ioexception, transformerexception, parserconfigurationexception {
  testwordtohtml wt=new testwordtohtml();
  //wt.word2003tohtml("甲骨文考证.doc");
  wt.word2007tohtml("甲骨文考证.docx");

 }

  /**
  * 2003版本word转换成html
  * @throws ioexception
  * @throws transformerexception
  * @throws parserconfigurationexception
  */
 public void word2003tohtml(string filename) throws ioexception, transformerexception, parserconfigurationexception {

  final string imagepath = storagepath+"fileimage/";//解析时候如果doc文件中有图片 图片会保存在此路径
  final string strranstring=getrandomnum();
  string filepath =storagepath;
  string htmlname =filename.substring(0, filename.indexof("."))+ "2003.html";
  final string file = filepath + filename;
  inputstream input = new fileinputstream(new file(file));
  hwpfdocument worddocument = new hwpfdocument(input);
  wordtohtmlconverter wordtohtmlconverter = new wordtohtmlconverter(documentbuilderfactory.newinstance().newdocumentbuilder().newdocument());
  //设置图片存放的位置
  wordtohtmlconverter.setpicturesmanager(new picturesmanager() {
   public string savepicture(byte[] content, picturetype picturetype, string suggestedname, float widthinches, float heightinches) {
    file imgpath = new file(imagepath);
    if(!imgpath.exists()){//图片目录不存在则创建
     imgpath.mkdirs();
    }

    file file = new file(imagepath +strranstring+suggestedname);
    try {
     outputstream os = new fileoutputstream(file);
     os.write(content);
     os.close();
    } catch (filenotfoundexception e) {
     e.printstacktrace();
    } catch (ioexception e) {
     e.printstacktrace();
    }

    return "http://"+ip+":"+port+"//uploadfile/fileimage/"+strranstring+suggestedname;
    // return imagepath +strranstring+suggestedname;
   }
  });

  //解析word文档
  wordtohtmlconverter.processdocument(worddocument);
  document htmldocument = wordtohtmlconverter.getdocument();

  file htmlfile = new file(filepath +strranstring+htmlname);
  outputstream outstream = new fileoutputstream(htmlfile);


  domsource domsource = new domsource(htmldocument);
  streamresult streamresult = new streamresult(outstream);

  transformerfactory factory = transformerfactory.newinstance();
  transformer serializer = factory.newtransformer();
  serializer.setoutputproperty(outputkeys.encoding, "utf-8");
  serializer.setoutputproperty(outputkeys.indent, "yes");
  serializer.setoutputproperty(outputkeys.method, "html");

  serializer.transform(domsource, streamresult);
  outstream.close();

  system.out.println("生成html文件路径:"+ "http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname);
 }

 /**
  * 2007版本word转换成html
  * @throws ioexception
  */
 public void word2007tohtml(string filename) throws ioexception {

  final string strranstring=getrandomnum();

  string filepath = storagepath+strranstring;
  string htmlname =filename.substring(0, filename.indexof("."))+ "2007.html";
  file f = new file(storagepath+filename);
  if (!f.exists()) {
   system.out.println("sorry file does not exists!");
  } else {
   if (f.getname().endswith(".docx") || f.getname().endswith(".docx")) {
    try {
     // 1) 加载word文档生成 xwpfdocument对象
     inputstream in = new fileinputstream(f);
     xwpfdocument document = new xwpfdocument(in);

     // 2) 解析 xhtml配置 (这里设置iuriresolver来设置图片存放的目录)
     file imagefolderfile = new file(filepath);
     xhtmloptions options = xhtmloptions.create().uriresolver(new fileuriresolver(imagefolderfile));
     options.setextractor(new fileimageextractor(imagefolderfile));
     options.uriresolver(new iuriresolver() {
      public string resolve(string uri) {
       //http://192.168.30.222:8010//uploadfile/....
       return "http://"+ip+":"+port+"//uploadfile/"+strranstring +"/"+ uri;
      }
     });

     options.setignorestylesifunused(false);
     options.setfragment(true);

     // 3) 将 xwpfdocument转换成xhtml
     outputstream out = new fileoutputstream(new file(filepath + htmlname));
     ixwpfconverter<xhtmloptions> converter = xhtmlconverter.getinstance();
     converter.convert(document,out, options);
     //xhtmlconverter.getinstance().convert(document, out, options);
     system.out.println("html路径:"+"http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname);
    } catch (exception e) {
     e.printstacktrace();
    }

   } else {
    system.out.println("enter only ms office 2007+ files");
   }
  }
 }

  /**
  *功能说明:生成时间戳
  *创建人:zsq
  *创建时间:2019年12月7日 下午2:37:09
  *
  */
  public static string getrandomnum(){
   date dt = new date();
   simpledateformat sdf = new simpledateformat("yyyymmddhhmmss");
   string str=sdf.format(dt);
   return str;
  }

 }

二:java实现将pdf转换为html

1: 引入依赖

<dependency>
 <groupId>net.sf.cssbox</groupId>
 <artifactId>pdf2dom</artifactId>
 <version>1.7</version>
</dependency>
<dependency>
 <groupId>org.apache.pdfbox</groupId>
 <artifactId>pdfbox</artifactId>
 <version>2.0.12</version>
</dependency>
<dependency>
 <groupId>org.apache.pdfbox</groupId>
 <artifactId>pdfbox-tools</artifactId>
 <version>2.0.12</version>
</dependency>

2:代码demo

 /**
  * Demo that converts a PDF file into an HTML file using PDFBox and Pdf2Dom.
  */
 public class pdftohtml {

     /**
      * Converts the PDF at {@code inpdfpath} into a UTF-8 encoded HTML file at
      * {@code outputhtmlpath}.
      *
      * <p>Both the writer and the loaded PDF are opened in try-with-resources, so the buffered
      * HTML is flushed to disk and the file handles are released even when conversion fails.
      * Failures are reported on standard error; nothing is thrown to the caller.
      *
      * @param inpdfpath path of the PDF file to read
      * @param outputhtmlpath path of the HTML file to create or overwrite
      */
     public void pdftohtmltest(String inpdfpath, String outputhtmlpath) {
         try (BufferedWriter out = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(new File(outputhtmlpath)), "utf-8"));
              PDDocument document = PDDocument.load(new File(inpdfpath))) {
             PDFDomTree pdfDomTree = new PDFDomTree();
             pdfDomTree.writeText(document, out);
         } catch (Exception e) {
             // The signature declares no checked exceptions, so report the failure here.
             e.printStackTrace();
         }
     }

     /**
      * Converts a sample PDF from a fixed local path.
      *
      * @param args unused
      * @throws IOException declared by the original demo; conversion errors are reported, not thrown
      */
     public static void main(String[] args) throws IOException {
         pdftohtml ph = new pdftohtml();
         String pdfPath = "c:\\works\\files\\武研中心行政考勤制度.pdf";
         String outputPath = "c:\\works\\files\\武研中心行政考勤制度.html";
         ph.pdftohtmltest(pdfPath, outputPath);
     }

 }

三:java实现将txt转换为html

/**
 * Converts a GBK-encoded text file into an HTML fragment, one source line per output line.
 *
 * <p>Each line is HTML-escaped, prefixed with three non-breaking spaces and terminated with a
 * line-break tag. The output is written in GBK as well and contains no surrounding
 * {@code <html>} or {@code <meta charset>} markup, so the page embedding it must declare the
 * encoding. A missing source file and any I/O failure are reported on standard output; nothing
 * is thrown to the caller.
 *
 * @param filepath path of the source text file
 * @param htmlposition path of the HTML file to create or overwrite
 */
 public static void txttohtml(String filepath, String htmlposition) {
     try {
         File source = new File(filepath);
         if (!source.isFile()) { // isFile() is false for a missing path too
             System.out.println("找不到指定的文件");
             return;
         }
         // try-with-resources closes every stream, including when a read or write fails.
         try (FileInputStream fis = new FileInputStream(source);
              BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "gbk"));
              FileOutputStream fos = new FileOutputStream(new File(htmlposition));
              BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, "gbk"))) {
             String line;
             while ((line = reader.readLine()) != null) {
                 writer.write("&nbsp;&nbsp;&nbsp;" + escapeHtml(line) + "<br/>");
             }
         }
     } catch (Exception e) {
         System.out.println("读取文件内容出错");
         e.printStackTrace();
     }
 }

 /** Escapes the characters that would otherwise be parsed as HTML markup. */
 private static String escapeHtml(String text) {
     StringBuilder escaped = new StringBuilder(text.length() + 16);
     for (int i = 0; i < text.length(); i++) {
         char c = text.charAt(i);
         switch (c) {
             case '&':
                 escaped.append("&amp;");
                 break;
             case '<':
                 escaped.append("&lt;");
                 break;
             case '>':
                 escaped.append("&gt;");
                 break;
             default:
                 escaped.append(c);
         }
     }
     return escaped.toString();
 }

以上就是java实现word/pdf/txt转html的示例的详细内容，更多关于java word/pdf/txt转html的资料请关注其它相关文章！

Java实现Word/Pdf/TXT转html的示例

纯js实现html转pdf的简单实例(推荐)

纯js实现html转pdf的简单实例(推荐)

Java编程实现的二维数组转置功能示例

js实现html转img、pdf的方法（代码教程）

php实现word转html的方法

Python实现批量将word转html并将html内容发布至网站的方法

C#实现HTML转WORD及WORD转PDF的方法

Java实现Word/Pdf/TXT转html

php用windows COM组件调用openoffice接口实现word转pdf文件时报错的解决办法

Java实现Word/Excel/TXT转PDF