欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

word转HTML 基本版

程序员文章站 2022-04-09 22:12:07
...

同时支持doc和docx,话不多说,直接上代码
项目依赖 pom.xml

<!-- commons-lang: supplies org.apache.commons.lang.StringUtils, used for the blank-path check -->
<dependency>
    <groupId>commons-lang</groupId>
    <artifactId>commons-lang</artifactId>
    <version>2.6</version>
</dependency>
<!-- Apache POI core artifact; declared at the same version (3.14) as poi-ooxml and poi-scratchpad -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>
<!-- poi-ooxml: org.apache.poi.xwpf.usermodel.XWPFDocument, used for .docx input -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>
<!-- poi-scratchpad: org.apache.poi.hwpf classes (HWPFDocument, WordToHtmlConverter), used for .doc input -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<!-- xdocreport XHTML converter: XHTMLConverter, XHTMLOptions, FileImageExtractor, BasicURIResolver -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.6</version>
</dependency>
<!-- NOTE(review): jtidy is not referenced by the WordToHtml class in this article; confirm it is still needed -->
<dependency>
    <groupId>net.sf.jtidy</groupId>
    <artifactId>jtidy</artifactId>
    <version>r938</version>
</dependency>

正式代码如下:

package com.zbj.spring.boot.util;

import lombok.Cleanup;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

/**
 * Converts Word documents to an HTML file.
 *
 * <p>Input whose path ends with {@code docx} (ignoring case) is rendered with the xdocreport
 * {@link XHTMLConverter}; any other input is treated as a binary {@code .doc} file and rendered
 * with POI's {@link WordToHtmlConverter}. Embedded pictures are written to an image directory
 * and referenced from the generated HTML under the relative prefix {@code image/}.
 *
 * @author weigang
 * @create 2017-10-13
 **/
public class WordToHtml {

    /** HTML output file used by {@link #convertWordToHtml(String)}. */
    private static final String DEFAULT_HTML_PATH = "D:/test/index.html";

    /** Picture output directory used by {@link #convertWordToHtml(String)}. */
    private static final String DEFAULT_IMAGE_PATH = "D:/test/image/";

    /** Relative location under which the generated HTML references extracted pictures. */
    private static final String IMAGE_URI_PREFIX = "image";

    /**
     * Command-line entry point.
     *
     * @param args optional; when present, {@code args[0]} is the Word file to convert,
     *             otherwise a sample path is used ({@code .doc} and {@code .docx} both work)
     */
    public static void main(String[] args) {
        String sourceFileName = args.length > 0 ? args[0] : "D:/test/员工劳动合同.doc";
        try {
            convertWordToHtml(sourceFileName);
        } catch (IOException | ParserConfigurationException | TransformerException e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts the Word file at {@code path} using the default output locations
     * ({@code D:/test/index.html} and {@code D:/test/image/}).
     *
     * @param path path of the {@code .doc} or {@code .docx} file; a blank value is a no-op
     * @throws IOException                  if the source cannot be read or the output cannot be written
     * @throws ParserConfigurationException if no DOM builder can be created ({@code .doc} branch)
     * @throws TransformerException         if the HTML DOM cannot be serialized ({@code .doc} branch)
     */
    public static void convertWordToHtml(String path) throws IOException, ParserConfigurationException, TransformerException {
        convertWordToHtml(path, DEFAULT_HTML_PATH, DEFAULT_IMAGE_PATH);
    }

    /**
     * Converts the Word file at {@code path} to the HTML file {@code htmlPath}, saving embedded
     * pictures into {@code imagePath}.
     *
     * <p>The HTML always refers to pictures as {@code image/<name>}, so the links only resolve
     * when {@code imagePath} is a directory named {@code image} next to the HTML file.
     *
     * @param path      path of the {@code .doc} or {@code .docx} file; a blank value is a no-op
     * @param htmlPath  file the HTML is written to, encoded as UTF-8
     * @param imagePath directory that receives the extracted pictures
     * @throws IOException                  if the source cannot be read or the output cannot be written
     * @throws ParserConfigurationException if no DOM builder can be created ({@code .doc} branch)
     * @throws TransformerException         if the HTML DOM cannot be serialized ({@code .doc} branch)
     */
    public static void convertWordToHtml(String path, String htmlPath, String imagePath)
            throws IOException, ParserConfigurationException, TransformerException {
        if (StringUtils.isBlank(path)) {
            return;
        }

        File imageDir = new File(imagePath);
        // Case-insensitive so that e.g. "REPORT.DOCX" is not sent to the binary .doc parser.
        if (StringUtils.endsWithIgnoreCase(path, "docx")) { // Word 2007 and later
            convertDocx(path, htmlPath, imageDir);
        } else { // Word 2003 and earlier
            convertDoc(path, htmlPath, imageDir);
        }
    }

    /** Renders an OOXML ({@code .docx}) document to {@code htmlPath} via xdocreport. */
    private static void convertDocx(String path, String htmlPath, File imageDir) throws IOException {
        try (FileInputStream in = new FileInputStream(path)) {
            // Parse before opening the output so a bad input does not truncate an existing HTML file.
            XWPFDocument document = new XWPFDocument(in);

            XHTMLOptions options = XHTMLOptions.create();
            options.setExtractor(new FileImageExtractor(imageDir));
            options.URIResolver(new BasicURIResolver(IMAGE_URI_PREFIX));

            // Explicit UTF-8 instead of the platform default, matching the .doc branch.
            try (OutputStreamWriter streamWriter =
                         new OutputStreamWriter(new FileOutputStream(htmlPath), StandardCharsets.UTF_8)) {
                XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
                xhtmlConverter.convert(document, streamWriter, options);
            }
        }
    }

    /** Renders a binary ({@code .doc}) document to {@code htmlPath} via POI's HWPF converter. */
    private static void convertDoc(String path, String htmlPath, File imageDir)
            throws IOException, ParserConfigurationException, TransformerException {
        HWPFDocument wordDocument;
        try (FileInputStream in = new FileInputStream(path)) {
            wordDocument = new HWPFDocument(in);
        }

        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
        // Save each picture and return the relative path the HTML should reference.
        wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> {
            if (!imageDir.exists()) {
                imageDir.mkdirs();
            }
            File imageFile = new File(imageDir, name);
            try (FileOutputStream out = new FileOutputStream(imageFile)) {
                out.write(content);
            } catch (IOException e) {
                // Best effort: a picture that cannot be saved must not abort the whole conversion.
                System.err.println("Failed to write picture " + imageFile);
                e.printStackTrace();
            }
            return IMAGE_URI_PREFIX + "/" + name;
        });
        wordToHtmlConverter.processDocument(wordDocument);

        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");

        // The stream is opened here so that this method also closes it.
        try (FileOutputStream out = new FileOutputStream(htmlPath)) {
            serializer.transform(new DOMSource(wordToHtmlConverter.getDocument()), new StreamResult(out));
        }
    }
}

测试用的 Word 文件,在网上随便下载一份合同或其他文档即可。

相关标签: html word 转换