使用POI读写PowerPoint文件(兼容ppt与pptx版本) java office powerpoint poi
程序员文章站
2022-06-03 15:12:51
...
调用示例:
File powerPointFile = new File("D:\\temp.ppt"); //读取PowerPoint文档中所有文本内容,以字符串形式返回 System.out.println(PowerPointFileUtil.extractTextFromPowerPointFile(powerPointFile , "," , ";"));
工具类源码:
/**
 * BasePowerPointFileUtil.java
 * Copyright ® 2017 窦海宁
 * All right reserved
 */
package org.aiyu.core.common.util.file.office;

import java.util.ArrayList;
import java.util.List;

import org.apache.poi.sl.usermodel.AutoShape;
import org.apache.poi.sl.usermodel.Shape;
import org.apache.poi.sl.usermodel.Slide;
import org.apache.poi.sl.usermodel.SlideShow;

/**
 * <p>Base class for the PowerPoint file utilities.
 *
 * <p>Walks a slide show through the format-neutral {@code org.apache.poi.sl.usermodel}
 * interfaces and collects the text of its shapes, so subclasses only have to open
 * the concrete document type.
 *
 * @author 窦海宁, chong0660@sina.com
 * @since AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class BasePowerPointFileUtil {

    /**
     * <p>Reads every slide of a slide show.
     *
     * @param slideShow the slide show to read; may be {@code null}
     *
     * @return a list with one entry per slide, each entry being the list produced by
     *         {@link #readSlide(Slide)}; {@code null} when {@code slideShow} is {@code null}
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static List readSlideShow(SlideShow slideShow) {

        if (slideShow == null) {

            return null;
        }

        List slideDataList = new ArrayList();
        for (Object slide : slideShow.getSlides()) {

            slideDataList.add(readSlide((Slide) slide));
        }
        return slideDataList;
    }

    /**
     * <p>Reads every shape of a single slide.
     *
     * @param slide the slide to read; may be {@code null}
     *
     * @return a list with one entry per shape, each entry being the value produced by
     *         {@link #readShape(Shape)} (entries may be {@code null});
     *         {@code null} when {@code slide} is {@code null}
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static List readSlide(Slide slide) {

        if (slide == null) {

            return null;
        }

        List shapeDataList = new ArrayList();
        for (Object shape : slide.getShapes()) {

            shapeDataList.add(readShape((Shape) shape));
        }
        return shapeDataList;
    }

    /**
     * <p>Reads the text of a single shape.
     *
     * @param shape a shape taken from a slide; may be {@code null}
     *
     * @return the text of the shape when it is an {@link AutoShape}; {@code null} for a
     *         {@code null} shape, for any other shape type, or when reading the text fails
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static Object readShape(Shape shape) {

        // instanceof is false for null, so this guard also covers a missing shape.
        if (!(shape instanceof AutoShape)) {

            return null;
        }

        String text = null;
        try {

            text = ((AutoShape) shape).getText();
        } catch (Exception ex) {

            // Best effort: a shape whose text cannot be read is reported and yields null.
            ex.printStackTrace();
        }
        return text;
    }
}
PowerPoint2003版本工具类:
/**
 * PowerPoint2003FileUtil.java
 * Copyright ® 2010 窦海宁
 * All right reserved
 */
package org.aiyu.core.common.util.file.office;

import java.io.File;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.sl.usermodel.SlideShow;

/**
 * <p>Utility for PowerPoint 2003 ({@code .ppt}) files.
 *
 * <p>Opens the document with POI's HSLF classes and extracts its text through the
 * shared readers of {@link BasePowerPointFileUtil}.
 *
 * @author 窦海宁, chong0660@sina.com
 * @since AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPoint2003FileUtil extends BasePowerPointFileUtil {

    /**
     * <p>Extracts the text of a PowerPoint document.
     *
     * <p>Text values of one slide are joined with {@code shapeSeparator}; shapes that
     * yield no text are skipped and never produce a separator. Slides are joined with
     * {@code slideSeparator}, which is emitted between every pair of slides even when
     * a slide has no text. The slide show is always closed before returning.
     *
     * @param powerPointFile the PowerPoint file
     * @param shapeSeparator separator placed between the texts of two shapes
     * @param slideSeparator separator placed between two slides
     *
     * @return the extracted text with surrounding whitespace trimmed; {@code null} when
     *         any argument is {@code null}, the path is not a regular file, or nothing
     *         but whitespace was extracted. If reading fails part-way, the stack trace
     *         is printed and the text collected so far is returned.
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {

        StringBuilder returnValue = new StringBuilder();

        if (powerPointFile != null && slideSeparator != null && shapeSeparator != null && powerPointFile.isFile()) {

            HSLFSlideShow slideShow = null;
            try {

                slideShow = new HSLFSlideShow(new HSLFSlideShowImpl(powerPointFile.getCanonicalPath()));

                // Iterate over the slides
                Iterator slideIterator = PowerPoint2003FileUtil.readSlideShow(slideShow).iterator();
                while (slideIterator.hasNext()) {

                    // Tracks whether this slide already contributed text, so the shape
                    // separator only ever appears between two actual text values.
                    boolean slideHasText = false;

                    // Iterate over the shapes
                    Iterator shapeIterator = ((List) slideIterator.next()).iterator();
                    while (shapeIterator.hasNext()) {

                        Object shapeValue = shapeIterator.next();
                        if (shapeValue != null) {

                            if (slideHasText) {

                                returnValue.append(shapeSeparator);
                            }
                            returnValue.append((String) shapeValue);
                            slideHasText = true;
                        }
                    }

                    if (slideIterator.hasNext()) {

                        returnValue.append(slideSeparator);
                    }
                }
            } catch (Exception ex) {

                // Best effort: report the failure and return whatever was collected.
                ex.printStackTrace();
            } finally {

                // Release the underlying file even when extraction failed.
                if (slideShow != null) {

                    try {

                        slideShow.close();
                    } catch (Exception ex) {

                        ex.printStackTrace();
                    }
                }
            }
        }

        return StringUtils.trimToNull(returnValue.toString());
    }
}
PowerPoint2007版本工具类:
/**
 * PowerPoint2007FileUtil.java
 * Copyright ® 2017 窦海宁
 * All right reserved
 */
package org.aiyu.core.common.util.file.office;

import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xslf.usermodel.XMLSlideShow;

/**
 * <p>Utility for PowerPoint 2007 ({@code .pptx}) files.
 *
 * <p>Opens the document with POI's {@link XMLSlideShow} and extracts its text through
 * the shared readers of {@link BasePowerPointFileUtil}.
 *
 * @author 窦海宁, chong0660@sina.com
 * @since AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPoint2007FileUtil extends BasePowerPointFileUtil {

    /**
     * <p>Extracts the text of a PowerPoint document.
     *
     * <p>Text values of one slide are joined with {@code shapeSeparator}; shapes that
     * yield no text are skipped and never produce a separator. Slides are joined with
     * {@code slideSeparator}, which is emitted between every pair of slides even when
     * a slide has no text. The slide show and the input stream are always closed
     * before returning.
     *
     * @param powerPointFile the PowerPoint file
     * @param shapeSeparator separator placed between the texts of two shapes
     * @param slideSeparator separator placed between two slides
     *
     * @return the extracted text with surrounding whitespace trimmed; {@code null} when
     *         any argument is {@code null}, the path is not a regular file, or nothing
     *         but whitespace was extracted. If reading fails part-way, the stack trace
     *         is printed and the text collected so far is returned.
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {

        StringBuilder returnValue = new StringBuilder();

        if (powerPointFile != null && slideSeparator != null && shapeSeparator != null && powerPointFile.isFile()) {

            FileInputStream inputStream = null;
            XMLSlideShow slideShow = null;
            try {

                inputStream = new FileInputStream(powerPointFile);
                slideShow = new XMLSlideShow(inputStream);

                // Iterate over the slides
                Iterator slideIterator = PowerPoint2007FileUtil.readSlideShow(slideShow).iterator();
                while (slideIterator.hasNext()) {

                    // Tracks whether this slide already contributed text, so the shape
                    // separator only ever appears between two actual text values.
                    boolean slideHasText = false;

                    // Iterate over the shapes
                    Iterator shapeIterator = ((List) slideIterator.next()).iterator();
                    while (shapeIterator.hasNext()) {

                        Object shapeValue = shapeIterator.next();
                        if (shapeValue != null) {

                            if (slideHasText) {

                                returnValue.append(shapeSeparator);
                            }
                            returnValue.append((String) shapeValue);
                            slideHasText = true;
                        }
                    }

                    if (slideIterator.hasNext()) {

                        returnValue.append(slideSeparator);
                    }
                }
            } catch (Exception ex) {

                // Best effort: report the failure and return whatever was collected.
                ex.printStackTrace();
            } finally {

                // Release the document first, then the stream it was loaded from;
                // a failure while closing one must not prevent closing the other.
                if (slideShow != null) {

                    try {

                        slideShow.close();
                    } catch (Exception ex) {

                        ex.printStackTrace();
                    }
                }
                if (inputStream != null) {

                    try {

                        inputStream.close();
                    } catch (Exception ex) {

                        ex.printStackTrace();
                    }
                }
            }
        }

        return StringUtils.trimToNull(returnValue.toString());
    }
}
统一调用工具类:
/**
 * PowerPointFileUtil.java
 * Copyright ® 2017 窦海宁
 * All right reserved
 */
package org.aiyu.core.common.util.file.office;

import java.io.File;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * <p>PowerPoint file utility (public entry point).
 *
 * <p>Picks the format-specific utility from the file extension and delegates the
 * text extraction to it.
 *
 * @author 窦海宁, chong0660@sina.com
 * @since AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPointFileUtil extends BasePowerPointFileUtil {

    /**
     * <p>Extracts the text of a PowerPoint document.
     *
     * <p>The format is chosen from the file extension, compared case-insensitively:
     * {@code ppt} is handled by {@link PowerPoint2003FileUtil} and {@code pptx} by
     * {@link PowerPoint2007FileUtil}.
     *
     * @param powerPointFile the PowerPoint file
     * @param shapeSeparator separator placed between the texts of two shapes
     * @param slideSeparator separator placed between two slides
     *
     * @return the text returned by the format-specific utility; {@code null} when the
     *         file is {@code null}, does not exist, or has any other extension
     *
     * @modify 窦海宁, 2017-02-06
     */
    public static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {

        String resultText = null;

        if (powerPointFile != null && powerPointFile.exists()) {

            String extension = FilenameUtils.getExtension(powerPointFile.getName());
            if (StringUtils.equalsIgnoreCase("ppt" , extension)) {

                // Office 2003 binary format
                resultText = PowerPoint2003FileUtil.extractTextFromPowerPointFile(powerPointFile , shapeSeparator , slideSeparator);
            } else if (StringUtils.equalsIgnoreCase("pptx" , extension)) {

                // Office 2007 OOXML format: must go to the 2007 utility, not the 2003 one
                resultText = PowerPoint2007FileUtil.extractTextFromPowerPointFile(powerPointFile , shapeSeparator , slideSeparator);
            } else {

                // Unsupported file type: resultText stays null
            }
        }

        return resultText;
    }
}
统一调用工具类通过文件扩展名(ppt 与 pptx,不区分大小写)判断文件版本,暂时没有想到更好的办法;本工具类基于 POI 3.15 实现,目标机器无须安装 Office 软件即可读取 PowerPoint 文件中的文本。
上一篇: C#中的局部变量冲突问题
下一篇: C#关键字async/await用法