POI处理大容量Excel解决方案 - 含03 07版
程序员文章站
2022-07-13 13:16:25
...
前言
网上关于POI处理大文件的方案有很多,但是都比较古老,我集中整理了一下,可以直接copy到项目中使用。关于小数等精度问题也处理好了,大家使用自己注意下就好。目前我配置2G内存,支持100m左右的excel,不会内存溢出,体验良好
上代码
先写接口
/**
* <code>BigDataParseExcel</code>
*
* @see
* @since 2019/4/15 v1.0
*/
public interface BigDataParseExcel {
abstract void optRows(int sheetIndex, int curRow,
List<String> rowList);
}
07版的抽象类
/**
* <code>BigDataParseExcelUtil</code>
* <p>
* XSSF and SAX (Event API)
* </p>
*
* @author daguozb
* @link https://www.cnblogs.com/daguozb/p/10031970.html
* @see
* @since 2019/4/15 v1.0
*/
/**
* @author y
* @create 2018-01-18 14:28
* @desc POI读取excel有两种模式,一种是用户模式,一种是事件驱动模式
* 采用SAX事件驱动模式解决XLSX文件,可以有效解决用户模式内存溢出的问题,
* 该模式是POI官方推荐的读取大数据的模式,
* 在用户模式下,数据量较大,Sheet较多,或者是有很多无用的空行的情况下,容易出现内存溢出
* <p>
* 用于解决.xlsx2007版本大数据量问题
**/
public abstract class BigDataParseExcelUtil extends DefaultHandler implements BigDataParseExcel {
/**
* 单元格中的数据可能的数据类型
*/
enum CellDataType {
BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
}
/**
* 共享字符串表
*/
private SharedStringsTable sst;
/**
* 上一次的索引值
*/
private String lastIndex;
/**
* 文件的绝对路径
*/
private String filePath = "";
/**
* 工作表索引
*/
private int sheetIndex = 0;
/**
* sheet名
*/
private String sheetName = "";
/**
* 总行数
*/
private int totalRows=0;
/**
* 一行内cell集合
*/
private List<String> cellList = new ArrayList<>();
/**
* 判断整行是否为空行的标记
*/
private boolean flag = false;
/**
* 当前行
*/
private int curRow = 1;
/**
* 当前列
*/
private int curCol = 0;
/**
* T元素标识
*/
private boolean isTElement;
/**
* 单元格数据类型,默认为字符串类型
*/
private CellDataType nextDataType = CellDataType.SSTINDEX;
private final DataFormatter formatter = new DataFormatter();
/**
* 单元格日期格式的索引
*/
private short formatIndex;
/**
* 日期格式字符串
*/
private String formatString;
//定义前一个元素和当前元素的位置,用来计算其中空的单元格数量,如A6和A8等
private String preRef = null, ref = null;
//定义该文档一行最大的单元格数,用来补全一行最后可能缺失的单元格
private String maxRef = null;
/**
* 单元格
*/
private StylesTable stylesTable;
/**
* 遍历工作簿中所有的电子表格
* 并缓存在mySheetList中
*
* @throws Exception
*/
public int process(OPCPackage pkg) throws Exception {
XSSFReader xssfReader = new XSSFReader(pkg);
stylesTable = xssfReader.getStylesTable();
SharedStringsTable sst = xssfReader.getSharedStringsTable();
XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
this.sst = sst;
parser.setContentHandler(this);
XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
while (sheets.hasNext()) { //遍历sheet
curRow = 1; //标记初始行为第一行
sheetIndex++;
InputStream sheet = sheets.next(); //sheets.next()和sheets.getSheetName()不能换位置,否则sheetName报错
sheetName = sheets.getSheetName();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource); //解析excel的每条记录,在这个过程中startElement()、characters()、endElement()这三个函数会依次执行
sheet.close();
}
return totalRows; //返回该excel文件的总行数,不包括首列和空行
}
public void readExcel(String fileName) throws Exception {
process(OPCPackage.open(fileName));
}
public void readExcel(InputStream inputStream) throws Exception {
process(OPCPackage.open(inputStream));
}
/**
* 第一个执行
*
* @param uri
* @param localName
* @param name
* @param attributes
* @throws SAXException
*/
@Override
public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
//c => 单元格
if ("c".equals(name)) {
//前一个单元格的位置
if (preRef == null) {
preRef = attributes.getValue("r");
} else {
preRef = ref;
}
//当前单元格的位置
ref = attributes.getValue("r");
//设定单元格类型
this.setNextDataType(attributes);
}
//当元素为t时
if ("t".equals(name)) {
isTElement = true;
} else {
isTElement = false;
}
//置空
lastIndex = "";
}
/**
* 第二个执行
* 得到单元格对应的索引值或是内容值
* 如果单元格类型是字符串、INLINESTR、数字、日期,lastIndex则是索引值
* 如果单元格类型是布尔值、错误、公式,lastIndex则是内容值
* @param ch
* @param start
* @param length
* @throws SAXException
*/
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
lastIndex += new String(ch, start, length);
}
/**
* 第三个执行
*
* @param uri
* @param localName
* @param name
* @throws SAXException
*/
@Override
public void endElement(String uri, String localName, String name) throws SAXException {
//t元素也包含字符串
if (isTElement) {//这个程序没经过
//将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符
String value = lastIndex.trim();
cellList.add(curCol, value);
curCol++;
isTElement = false;
//如果里面某个单元格含有值,则标识该行不为空行
if (value != null && !"".equals(value)) {
flag = true;
}
} else if ("v".equals(name)) {
//v => 单元格的值,如果单元格是字符串,则v标签的值为该字符串在SST中的索引
String value = this.getDataValue(lastIndex.trim(), "");//根据索引值获取对应的单元格值
//补全单元格之间的空单元格
if (!ref.equals(preRef)) {
int len = countNullCell(ref, preRef);
for (int i = 0; i < len; i++) {
cellList.add(curCol, "");
curCol++;
}
}
cellList.add(curCol, value);
curCol++;
//如果里面某个单元格含有值,则标识该行不为空行
if (value != null && !"".equals(value)) {
flag = true;
}
} else {
//如果标签名称为row,这说明已到行尾,调用optRows()方法
if ("row".equals(name)) {
//默认第一行为表头,以该行单元格数目为最大数目
if (curRow == 1) {
maxRef = ref;
}
//补全一行尾部可能缺失的单元格
if (maxRef != null) {
int len = countNullCell(maxRef, ref);
for (int i = 0; i <= len; i++) {
cellList.add(curCol, "");
curCol++;
}
}
//该行不为空行且该行不是第一行,则发送(第一行为列名,不需要)
if (flag&&curRow!=1){
optRows(sheetIndex, curRow, cellList);
totalRows++;
}
cellList.clear();
curRow++;
curCol = 0;
preRef = null;
ref = null;
flag=false;
}
}
}
/**
* 处理数据类型
*
* @param attributes
*/
public void setNextDataType(Attributes attributes) {
nextDataType = CellDataType.NUMBER; //cellType为空,则表示该单元格类型为数字
formatIndex = -1;
formatString = null;
String cellType = attributes.getValue("t"); //单元格类型
String cellStyleStr = attributes.getValue("s"); //
String columnData = attributes.getValue("r"); //获取单元格的位置,如A1,B1
if ("b".equals(cellType)) { //处理布尔值
nextDataType = CellDataType.BOOL;
} else if ("e".equals(cellType)) { //处理错误
nextDataType = CellDataType.ERROR;
} else if ("inlineStr".equals(cellType)) {
nextDataType = CellDataType.INLINESTR;
} else if ("s".equals(cellType)) { //处理字符串
nextDataType = CellDataType.SSTINDEX;
} else if ("str".equals(cellType)) {
nextDataType = CellDataType.FORMULA;
}
if (cellStyleStr != null) { //处理日期
int styleIndex = Integer.parseInt(cellStyleStr);
XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
formatIndex = style.getDataFormat();
formatString = style.getDataFormatString();
if (formatString.contains("m/d/yy")) {
nextDataType = CellDataType.DATE;
formatString = "yyyy-MM-dd hh:mm:ss";
}
if (formatString == null) {
nextDataType = CellDataType.NULL;
formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
}
}
}
/**
* 对解析出来的数据进行类型处理
* @param value 单元格的值,
* value代表解析:BOOL的为0或1, ERROR的为内容值,FORMULA的为内容值,INLINESTR的为索引值需转换为内容值,
* SSTINDEX的为索引值需转换为内容值, NUMBER为内容值,DATE为内容值
* @param thisStr 一个空字符串
* @return
*/
@SuppressWarnings("deprecation")
public String getDataValue(String value, String thisStr) {
switch (nextDataType) {
// 这几个的顺序不能随便交换,交换了很可能会导致数据错误
case BOOL: //布尔值
char first = value.charAt(0);
thisStr = first == '0' ? "FALSE" : "TRUE";
break;
case ERROR: //错误
thisStr = "\"ERROR:" + value.toString() + '"';
break;
case FORMULA: //公式
thisStr = '"' + value.toString() + '"';
break;
case INLINESTR:
XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
thisStr = rtsi.toString();
rtsi = null;
break;
case SSTINDEX: //字符串
String sstIndex = value.toString();
try {
int idx = Integer.parseInt(sstIndex);
XSSFRichTextString rtss = new XSSFRichTextString(sst.getEntryAt(idx));//根据idx索引值获取内容值
thisStr = rtss.toString();
rtss = null;
} catch (NumberFormatException ex) {
thisStr = value.toString();
}
break;
case NUMBER: //数字
if (formatString != null) {
thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim();
} else {
thisStr = value;
}
thisStr = thisStr.replace("_", "").trim();
break;
case DATE: //日期
thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);
// 对日期字符串作特殊处理,去掉T
thisStr = thisStr.replace("T", " ");
break;
default:
thisStr = " ";
break;
}
return thisStr;
}
public int countNullCell(String ref, String preRef) {
//excel2007最大行数是1048576,最大列数是16384,最后一列列名是XFD
String xfd = ref.replaceAll("\\d+", "");
String xfd_1 = preRef.replaceAll("\\d+", "");
xfd = fillChar(xfd, 3, '@', true);
xfd_1 = fillChar(xfd_1, 3, '@', true);
char[] letter = xfd.toCharArray();
char[] letter_1 = xfd_1.toCharArray();
int res = (letter[0] - letter_1[0]) * 26 * 26 + (letter[1] - letter_1[1]) * 26 + (letter[2] - letter_1[2]);
return res - 1;
}
public String fillChar(String str, int len, char let, boolean isPre) {
int len_1 = str.length();
if (len_1 < len) {
if (isPre) {
for (int i = 0; i < (len - len_1); i++) {
str = let + str;
}
} else {
for (int i = 0; i < (len - len_1); i++) {
str = str + let;
}
}
}
return str;
}
// 测试使用 可以改为main
public static void main(String[] args) throws Exception {
long start = System.currentTimeMillis();
ExcelXlsReader xlx = new ExcelXlsReader() {
@Override
public void optRows(int sheetIndex, int curRow, List<String> rowList) {
System.out.println("rowList = " + rowList);
}
};
xlx.readExcel("C:\\Users\\Administrator\\Desktop\\test-03.xls");
long end = System.currentTimeMillis();
System.out.println((end - start) / 1000);
}
}
03版的抽象类
/**
* @author y
* @create 2018-01-19 14:18
* @desc 用于解决.xls2003版本大数据量问题
**/
public abstract class ExcelXlsReader implements HSSFListener,BigDataParseExcel {
private int minColums = -1;
private POIFSFileSystem fs;
/**
* 总行数
*/
private int totalRows=0;
/**
* 上一行row的序号
*/
private int lastRowNumber;
/**
* 上一单元格的序号
*/
private int lastColumnNumber;
/**
* 是否输出formula,还是它对应的值
*/
private boolean outputFormulaValues = true;
/**
* 用于转换formulas
*/
private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
//excel2003工作簿
private HSSFWorkbook stubWorkbook;
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
private final HSSFDataFormatter formatter = new HSSFDataFormatter();
/**
* 文件的绝对路径
*/
private String filePath = "";
//表索引
private int sheetIndex = 0;
private BoundSheetRecord[] orderedBSRs;
@SuppressWarnings("unchecked")
private ArrayList boundSheetRecords = new ArrayList();
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
//当前行
private int curRow = 0;
//存储一行记录所有单元格的容器
private List<String> cellList = new ArrayList<String>();
/**
* 判断整行是否为空行的标记
*/
private boolean flag = false;
@SuppressWarnings("unused")
private String sheetName;
/**
* 遍历excel下所有的sheet
*
* @throws Exception
*/
public int process(InputStream inputStream) throws Exception {
this.fs = new POIFSFileSystem(inputStream);
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
//返回该excel文件的总行数,不包括首列和空行
return totalRows;
}
public void readExcel(InputStream inputStream) throws Exception {
process(inputStream);
}
public void readExcel(String fileName) throws Exception {
filePath = fileName;
FileInputStream fileInputStream = new FileInputStream(fileName);
process(fileInputStream);
}
/**
* HSSFListener 监听方法,处理Record
* 处理每个单元格
* @param record
*/
@SuppressWarnings("unchecked")
@Override
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
String value = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecords.add(record);
break;
case BOFRecord.sid: //开始处理每个sheet
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
//如果有需要,则建立子工作簿
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
}
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
}
sheetName = orderedBSRs[sheetIndex].getSheetname();
sheetIndex++;
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid: //单元格为空白
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
cellList.add(thisColumn, thisStr);
break;
case BoolErrRecord.sid: //单元格为布尔类型
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = berec.getBooleanValue() + "";
cellList.add(thisColumn, thisStr);
checkRowIsNull(thisStr); //如果里面某个单元格含有值,则标识该行不为空行
break;
case FormulaRecord.sid://单元格为公式类型
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
cellList.add(thisColumn, thisStr);
checkRowIsNull(thisStr); //如果里面某个单元格含有值,则标识该行不为空行
break;
case StringRecord.sid: //单元格中公式的字符串
if (outputNextStringRecord) {
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
curRow = thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
value = lrec.getValue().trim();
value = value.equals("") ? "" : value;
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
break;
case LabelSSTRecord.sid: //单元格为字符串类型
LabelSSTRecord lsrec = (LabelSSTRecord) record;
curRow = thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord == null) {
cellList.add(thisColumn, "");
} else {
value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();
value = value.equals("") ? "" : value;
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
}
break;
case NumberRecord.sid: //单元格为数字类型
NumberRecord numrec = (NumberRecord) record;
curRow = thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
//第一种方式
//value = formatListener.formatNumberDateCell(numrec).trim();//这个被写死,采用的m/d/yy h:mm格式,不符合要求
//第二种方式,参照formatNumberDateCell里面的实现方法编写
Double valueDouble=((NumberRecord)numrec).getValue();
String formatString=formatListener.getFormatString(numrec);
if (formatString.contains("m/d/yy")){
formatString="yyyy-MM-dd hh:mm:ss";
}
int formatIndex=formatListener.getFormatIndex(numrec);
value=formatter.formatRawCellContents(valueDouble, formatIndex, formatString).trim();
value = value.equals("") ? "" : value;
//向容器加入列值
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
break;
default:
break;
}
//遇到新行的操作
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
//空值的操作
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
curRow = thisRow = mc.getRow();
thisColumn = mc.getColumn();
cellList.add(thisColumn, "");
}
//更新行和列的值
if (thisRow > -1) {
lastRowNumber = thisRow;
}
if (thisColumn > -1) {
lastColumnNumber = thisColumn;
}
//行结束时的操作
if (record instanceof LastCellOfRowDummyRecord) {
if (minColums > 0) {
//列值重新置空
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
}
lastColumnNumber = -1;
//该行不为空行且该行不是第一行,发送(第一行为列名,不需要)
if (flag&&curRow!=0) {
optRows(sheetIndex, curRow + 1, cellList); //每行结束时,调用sendRows()方法
totalRows++;
}
//清空容器
cellList.clear();
flag=false;
}
}
/**
* 如果里面某个单元格含有值,则标识该行不为空行
* @param value
*/
public void checkRowIsNull(String value){
if (value != null && !"".equals(value)) {
flag = true;
}
}
}
使用
/**
* <code>BigDataParseExcelAdapter</code>
* 大批量文件导入的适配器,对外部屏蔽实现细节
*
* @author
* @see
* @since 2019/4/15 v1.0
*/
public abstract class BigDataParseExcelAdapter {
//excel2003扩展名
public static final String EXCEL03_EXTENSION = ".xls";
//excel2007扩展名
public static final String EXCEL07_EXTENSION = ".xlsx";
public abstract void dealRows(int sheetIndex, int curRow, List<String> rowList);
public void readExcel(InputStream inputStream) throws Exception{
if (POIFSFileSystem.hasPOIFSHeader(inputStream)){
ExcelXlsReader excelXls= new ExcelXlsReader() {
@Override
public void optRows(int sheetIndex, int curRow, List<String> rowList) {
dealRows(sheetIndex, curRow, rowList);
}
};
excelXls.readExcel(inputStream);
}else if (POIXMLDocument.hasOOXMLHeader(inputStream)){
BigDataParseExcelUtil bigDataParseExcelUtil = new BigDataParseExcelUtil() {
@Override
public void optRows(int sheetIndex, int curRow, List<String> rowList) {
dealRows(sheetIndex, curRow, rowList);
}
};
bigDataParseExcelUtil.readExcel(inputStream);
}else{
throw new Exception("文件格式错误,fileName的扩展名只能是xls或xlsx。");
}
}
public void readExcel(String fileName) throws Exception{
int totalRows =0;
//处理excel2003文件
if (fileName.endsWith(EXCEL03_EXTENSION)) {
ExcelXlsReader excelXls= new ExcelXlsReader() {
@Override
public void optRows(int sheetIndex, int curRow, List<String> rowList) {
dealRows(sheetIndex, curRow, rowList);
}
};
excelXls.readExcel(fileName);
}else if(fileName.endsWith(EXCEL07_EXTENSION)){
BigDataParseExcelUtil bigDataParseExcelUtil = new BigDataParseExcelUtil() {
@Override
public void optRows(int sheetIndex, int curRow, List<String> rowList) {
dealRows(sheetIndex, curRow, rowList);
}
};
bigDataParseExcelUtil.readExcel(fileName);
}else{
throw new Exception("文件格式错误,fileName的扩展名只能是xls或xlsx。");
}
}
//自动识别03 - 07 ,调用不同的进行处理。
public static void main(String[] args) throws Exception {
//03
//07
String path="C:\\Users\\Administrator\\Desktop\\test\\\\test.xlsx";
BigDataParseExcelAdapter bigDataParseExcelAdapter = new BigDataParseExcelAdapter() {
@Override
public void dealRows(int sheetIndex, int curRow, List<String> rowList) {
System.out.println("JSON.toJSONString(rowList) = " + JSON.toJSONString(rowList));
}
};
bigDataParseExcelAdapter.readExcel(path);
}
}
结语
对自己的一个总结,来源于网络和官方,自己做了一些修改,使用更加简便,无侵入。
上一篇: PHPExcel 读取Excel以xlsx格式的文件(转)
下一篇: 记录一次因导出导致的生产事故