欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

Java实现Word/Pdf/TXT转html的示例

程序员文章站 2022-03-28 17:42:14
引言: 最近公司在做一个教育培训学习及在线考试的项目,本人主要从事网络课程模块,主要做课程分类,课程,课件的创建及在线学习和统计的功能,因为课件涉及到多种类型,像视频,音频,图文,外部链接及文档类型...

引言:

    最近公司在做一个教育培训学习及在线考试的项目,本人主要负责网络课程模块,包括课程分类、课程、课件的创建,以及在线学习和统计功能。课件涉及多种类型,如视频、音频、图文、外部链接及文档类型。其中就涉及到一个问题:文档型课件如何在网页上展示和学习。因为要在线统计学习的课程、学习的人员和学习的时长,所以不能像传统做法那样将文档下载到本地学习,那样就不受系统控制了。最终的方案是:在上传文档型课件的时候,将文件转换成对应的html文件,以便在网页上能够浏览学习。

 下面主要针对word、pdf和txt文本文件的转换进行说明。

一:java实现将word转换为html

1:引入依赖

<dependency>
 <groupId>fr.opensagres.xdocreport</groupId>
 <artifactId>fr.opensagres.xdocreport.document</artifactId>
 <version>1.0.5</version>
</dependency>
<dependency>
 <groupId>fr.opensagres.xdocreport</groupId>
 <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
 <version>1.0.5</version>
</dependency>
<dependency>
 <groupId>org.apache.poi</groupId>
 <artifactId>poi</artifactId>
 <version>3.12</version>
</dependency>
<dependency>
 <groupId>org.apache.poi</groupId>
 <artifactId>poi-scratchpad</artifactId>
 <version>3.12</version>
</dependency>

2:代码demo

package com.svse.controller;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
/**
 * word 转换成html
 */
public class testwordtohtml {

 public static final string storagepath="c://works//files//";
 public static final string ip="192.168.30.222";
 public static final string port="8010";
 public static void main(string[] args) throws ioexception, transformerexception, parserconfigurationexception {
  testwordtohtml wt=new testwordtohtml();
  //wt.word2003tohtml("甲骨文考证.doc");
  wt.word2007tohtml("甲骨文考证.docx");

 }

  /**
  * 2003版本word转换成html
  * @throws ioexception
  * @throws transformerexception
  * @throws parserconfigurationexception
  */
 public void word2003tohtml(string filename) throws ioexception, transformerexception, parserconfigurationexception {

  final string imagepath = storagepath+"fileimage/";//解析时候如果doc文件中有图片 图片会保存在此路径
  final string strranstring=getrandomnum();
  string filepath =storagepath;
  string htmlname =filename.substring(0, filename.indexof("."))+ "2003.html";
  final string file = filepath + filename;
  inputstream input = new fileinputstream(new file(file));
  hwpfdocument worddocument = new hwpfdocument(input);
  wordtohtmlconverter wordtohtmlconverter = new wordtohtmlconverter(documentbuilderfactory.newinstance().newdocumentbuilder().newdocument());
  //设置图片存放的位置
  wordtohtmlconverter.setpicturesmanager(new picturesmanager() {
   public string savepicture(byte[] content, picturetype picturetype, string suggestedname, float widthinches, float heightinches) {
    file imgpath = new file(imagepath);
    if(!imgpath.exists()){//图片目录不存在则创建
     imgpath.mkdirs();
    }

    file file = new file(imagepath +strranstring+suggestedname);
    try {
     outputstream os = new fileoutputstream(file);
     os.write(content);
     os.close();
    } catch (filenotfoundexception e) {
     e.printstacktrace();
    } catch (ioexception e) {
     e.printstacktrace();
    }

    return "http://"+ip+":"+port+"//uploadfile/fileimage/"+strranstring+suggestedname;
    // return imagepath +strranstring+suggestedname;
   }
  });

  //解析word文档
  wordtohtmlconverter.processdocument(worddocument);
  document htmldocument = wordtohtmlconverter.getdocument();

  file htmlfile = new file(filepath +strranstring+htmlname);
  outputstream outstream = new fileoutputstream(htmlfile);


  domsource domsource = new domsource(htmldocument);
  streamresult streamresult = new streamresult(outstream);

  transformerfactory factory = transformerfactory.newinstance();
  transformer serializer = factory.newtransformer();
  serializer.setoutputproperty(outputkeys.encoding, "utf-8");
  serializer.setoutputproperty(outputkeys.indent, "yes");
  serializer.setoutputproperty(outputkeys.method, "html");

  serializer.transform(domsource, streamresult);
  outstream.close();

  system.out.println("生成html文件路径:"+ "http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname);
 }

 /**
  * 2007版本word转换成html
  * @throws ioexception
  */
 public void word2007tohtml(string filename) throws ioexception {

  final string strranstring=getrandomnum();

  string filepath = storagepath+strranstring;
  string htmlname =filename.substring(0, filename.indexof("."))+ "2007.html";
  file f = new file(storagepath+filename);
  if (!f.exists()) {
   system.out.println("sorry file does not exists!");
  } else {
   if (f.getname().endswith(".docx") || f.getname().endswith(".docx")) {
    try {
     // 1) 加载word文档生成 xwpfdocument对象
     inputstream in = new fileinputstream(f);
     xwpfdocument document = new xwpfdocument(in);

     // 2) 解析 xhtml配置 (这里设置iuriresolver来设置图片存放的目录)
     file imagefolderfile = new file(filepath);
     xhtmloptions options = xhtmloptions.create().uriresolver(new fileuriresolver(imagefolderfile));
     options.setextractor(new fileimageextractor(imagefolderfile));
     options.uriresolver(new iuriresolver() {
      public string resolve(string uri) {
       //http://192.168.30.222:8010//uploadfile/....
       return "http://"+ip+":"+port+"//uploadfile/"+strranstring +"/"+ uri;
      }
     });

     options.setignorestylesifunused(false);
     options.setfragment(true);

     // 3) 将 xwpfdocument转换成xhtml
     outputstream out = new fileoutputstream(new file(filepath + htmlname));
     ixwpfconverter<xhtmloptions> converter = xhtmlconverter.getinstance();
     converter.convert(document,out, options);
     //xhtmlconverter.getinstance().convert(document, out, options);
     system.out.println("html路径:"+"http://"+ip+":"+port+"//uploadfile/"+strranstring+htmlname);
    } catch (exception e) {
     e.printstacktrace();
    }

   } else {
    system.out.println("enter only ms office 2007+ files");
   }
  }
 }

  /**
  *功能说明:生成时间戳
  *创建人:zsq
  *创建时间:2019年12月7日 下午2:37:09
  *
  */
  public static string getrandomnum(){
   date dt = new date();
   simpledateformat sdf = new simpledateformat("yyyymmddhhmmss");
   string str=sdf.format(dt);
   return str;
  }

 }

二:java实现将pdf转换为html

1: 引入依赖

<dependency>
 <groupId>net.sf.cssbox</groupId>
 <artifactId>pdf2dom</artifactId>
 <version>1.7</version>
</dependency>
<dependency>
 <groupId>org.apache.pdfbox</groupId>
 <artifactId>pdfbox</artifactId>
 <version>2.0.12</version>
</dependency>
<dependency>
 <groupId>org.apache.pdfbox</groupId>
 <artifactId>pdfbox-tools</artifactId>
 <version>2.0.12</version>
</dependency>

2:代码demo

 /**
  * Converts a PDF file to an HTML file using pdf2dom's {@code PDFDomTree}
  * on top of a PDFBox {@code PDDocument}.
  */
 public class pdftohtml {

     /**
      * Converts the PDF at {@code inpdfpath} to HTML written to {@code outputhtmlpath}.
      *
      * <p>The HTML is written as UTF-8. Any failure is printed to stderr and not
      * propagated to the caller.
      *
      * @param inpdfpath path of the source PDF file
      * @param outputhtmlpath path of the HTML file to create
      */
     public void pdftohtmltest(String inpdfpath, String outputhtmlpath) {
         // Resources declared in try(...) are closed automatically, in reverse order:
         // the writer is flushed and closed first, then the PDF document.
         // The PDF is loaded first so an unreadable input does not leave an empty HTML file.
         try (PDDocument document = PDDocument.load(new File(inpdfpath));
              BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
                      new FileOutputStream(new File(outputhtmlpath)), "utf-8"))) {
             PDFDomTree pdfDomTree = new PDFDomTree();
             pdfDomTree.writeText(document, out);
         } catch (Exception e) {
             e.printStackTrace();
         }
     }

     /**
      * Demo entry point: converts one sample PDF.
      *
      * @param args unused
      */
     public static void main(String[] args) throws IOException {
         pdftohtml ph = new pdftohtml();
         String pdfPath = "c:\\works\\files\\武研中心行政考勤制度.pdf";
         String outputPath = "c:\\works\\files\\武研中心行政考勤制度.html";
         ph.pdftohtmltest(pdfPath, outputPath);
     }

 }

三:java实现将txt转换为html

/**
 * Converts a plain-text file to a simple HTML fragment.
 *
 * <p>The source is read as GBK and the result is written as GBK. Each input line
 * becomes one output chunk: three non-breaking spaces, the HTML-escaped line text
 * and a {@code <br/>} line break. Errors are reported on stdout/stderr and not
 * propagated.
 *
 * @param filepath path of the source txt file
 * @param htmlposition path of the HTML file to generate
 */
 public static void txttohtml(String filepath, String htmlposition) {
     try {
         File file = new File(filepath);
         if (!file.isFile()) { // isFile() is false for a missing path as well
             System.out.println("找不到指定的文件");
             return;
         }
         // try-with-resources closes both streams even when reading or writing fails.
         try (BufferedReader reader = new BufferedReader(
                      new InputStreamReader(new FileInputStream(file), "gbk"));
              BufferedWriter writer = new BufferedWriter(
                      new OutputStreamWriter(new FileOutputStream(new File(htmlposition)), "gbk"))) {
             String line;
             while ((line = reader.readLine()) != null) {
                 // Escape the text so '<' or '&' in the source cannot break the generated markup.
                 writer.write("&nbsp;&nbsp;&nbsp;" + escapeHtml(line) + "<br/>");
             }
         }
     } catch (Exception e) {
         System.out.println("读取文件内容出错");
         e.printStackTrace();
     }
 }

 /** Replaces the characters {@code & < > "} with their HTML entities. */
 private static String escapeHtml(String text) {
     StringBuilder escaped = new StringBuilder(text.length() + 16);
     for (int i = 0; i < text.length(); i++) {
         char c = text.charAt(i);
         switch (c) {
             case '&':
                 escaped.append("&amp;");
                 break;
             case '<':
                 escaped.append("&lt;");
                 break;
             case '>':
                 escaped.append("&gt;");
                 break;
             case '"':
                 escaped.append("&quot;");
                 break;
             default:
                 escaped.append(c);
         }
     }
     return escaped.toString();
 }

以上就是java实现word/pdf/txt转html的示例的详细内容,更多关于java word/pdf/txt转html的资料请关注其它相关文章!