java poi sax方式处理大数据量excel文件
程序员文章站
2023-11-13 15:50:16
系统需要用到一个导入excel文件的功能,使用poi组件常规方式读取excel时,内存耗尽,outofmemoryerror,或者读取非常慢
所以写了一个工具类,使用poi sax方...
系统需要用到一个导入excel文件的功能,使用poi组件常规方式读取excel时,内存耗尽,outofmemoryerror,或者读取非常慢
所以写了一个工具类,使用poi sax方式读取excel,速度快很多,内存消耗可以接受。
测试结果如下:
.xlsx文件,35m大小,总4个sheel,
只读取第一个,37434行,54列
总行数:37434
读取耗时:39秒
打印耗时:17秒
主要代码如下:
excelutils.class 主入口
package com.xxx.bi.utils.excel; import java.util.list; import java.util.objects; import org.apache.commons.lang3.stringutils; import com.google.common.collect.lists; public class excelutils { /** logger日志. */ // public static final logger logger = logger.getlogger(excelutils.class); public excelutils() { } /** * 获取excel的表头 * * @param filepath * 文件路径 * @param headernum * 表头所在行数 * @return */ public static list<string> getheader(string filepath, int headernum) { if (stringutils.isblank(filepath)) { throw new illegalargumentexception("传入文件路径不能为空"); } if (objects.isnull(headernum) || headernum < 1) { headernum = 1; } try { return largeexcelfilereadutil.getrowfromsheetone(filepath, headernum); } catch (exception e) { // logger.info("获取excel[" + filepath + "]表头失败,原因:", e); e.printstacktrace(); } return lists.newarraylist(); } /** * 获取excel的所有数据<br/> * 所有数据类型都是string<br/> * 会以第一行数据的列数为总列数,所以第一行的数据必须都不为空,否则可能出java.lang.indexoutofboundsexception * * @param filepath * 文件路径 * @param headernum * 表头所在行数 * @return */ public static list<list<string>> getalldata(string filepath) { if (stringutils.isblank(filepath)) { throw new illegalargumentexception("传入文件路径不能为空"); } try { return largeexcelfilereadutil.getrowsfromsheetone(filepath); } catch (exception e) { // logger.info("获取excel[" + filepath + "]表头失败,原因:", e); e.printstacktrace(); } return lists.newarraylist(); } public static void main(string[] args) { long start = system.currenttimemillis(); string filepath = "c:/users/administrator/desktop/05-作业调配表 -快递.xlsx"; // list<string> result = excelutils.getheader(filepath, 1); // for (string col : result) { // system.out.println(col); // } list<list<string>> result = excelutils.getalldata(filepath); long end = system.currenttimemillis(); for (list<string> list : result) { system.out.println(list.tostring()); } long end1 = system.currenttimemillis(); try { thread.sleep(1000l); } catch (interruptedexception e) { e.printstacktrace(); } system.err.println("总行数:" + result.size()); system.err.println(("读取耗时:" + (end - start) / 1000) + "秒"); system.err.println(("打印耗时:" + (end1 - end) / 1000) + "秒"); } }
largeexcelfilereadutil.class 真正的工具类
package com.xxx.bi.utils.excel; import java.io.inputstream; import java.util.list; import java.util.objects; import org.apache.log4j.logger; import org.apache.poi.openxml4j.opc.opcpackage; import org.apache.poi.xssf.eventusermodel.xssfreader; import org.apache.poi.xssf.model.sharedstringstable; import org.xml.sax.inputsource; import org.xml.sax.xmlreader; import org.xml.sax.helpers.xmlreaderfactory; public class largeexcelfilereadutil { /** logger日志. */ public static final logger logger = logger.getlogger(largeexcelfilereadutil.class); // 处理一个sheet public static list<string> getrowfromsheetone(string filename, integer rownum) throws exception { inputstream inputstream = null; opcpackage pkg = null; singlerowhandler singlerowhandler = null; try { pkg = opcpackage.open(filename); xssfreader r = new xssfreader(pkg); sharedstringstable sst = r.getsharedstringstable(); singlerowhandler = new singlerowhandler(sst, rownum); xmlreader parser = xmlreaderfactory.createxmlreader("com.sun.org.apache.xerces.internal.parsers.saxparser"); parser.setcontenthandler(singlerowhandler); inputstream = r.getsheet("rid1"); inputsource sheetsource = new inputsource(inputstream); parser.parse(sheetsource); return singlerowhandler.getrow(); } catch (exception e) { string message = e.getmessage(); if (objects.nonnull(rownum) && objects.nonnull(singlerowhandler) && singlerowhandler.finish_row_message.equalsignorecase(message)) { // 获取某一行数据完成 ,暂时不知道怎么能终止excel解析,直接抛出了异常,实际是成功的 return singlerowhandler.getrow(); } throw e; } finally { if (objects.nonnull(pkg)) { pkg.close(); } if (objects.nonnull(inputstream)) { inputstream.close(); } } } // 处理一个sheet public static list<list<string>> getrowsfromsheetone(string filename) throws exception { inputstream inputstream = null; opcpackage pkg = null; multirowhandler multirowhandler = null; try { pkg = opcpackage.open(filename); xssfreader r = new xssfreader(pkg); sharedstringstable sst = r.getsharedstringstable(); multirowhandler = new multirowhandler(sst); xmlreader parser = xmlreaderfactory.createxmlreader("com.sun.org.apache.xerces.internal.parsers.saxparser"); parser.setcontenthandler(multirowhandler); inputstream = r.getsheet("rid1"); inputsource sheetsource = new inputsource(inputstream); parser.parse(sheetsource); return multirowhandler.getrows(); } catch (exception e) { throw e; } finally { if (objects.nonnull(pkg)) { pkg.close(); } if (objects.nonnull(inputstream)) { inputstream.close(); } } } }
singlerowhandler.class 当行处理类,可以只获取表头或表格中的某一行数据
package com.xxx.bi.utils.excel; import java.util.arraylist; import java.util.list; import java.util.objects; import java.util.regex.pattern; import org.apache.poi.xssf.model.sharedstringstable; import org.apache.poi.xssf.usermodel.xssfrichtextstring; import org.xml.sax.attributes; import org.xml.sax.saxexception; import org.xml.sax.helpers.defaulthandler; public class singlerowhandler extends defaulthandler { public final static string finish_row_message = "row data process finish"; private integer rownum = null;// rownum不为空时则标示只需要获取这一行的数据 private int currownum = 1; private string celltype = ""; private sharedstringstable sst; private string lastcontents; private boolean nextisstring; private string cellposition; private list<string> row = new arraylist<>(); public list<string> getrow() { return row; } public singlerowhandler(sharedstringstable sst, integer rownum) { this.sst = sst; this.rownum = rownum; } public void startelement(string uri, string localname, string name, attributes attributes) throws saxexception { if (name.equals("c")) { cellposition = attributes.getvalue("r"); // 这是一个新行 if (pattern.compile("^a[0-9]+$").matcher(cellposition).find()) { currownum = integer.valueof(cellposition.substring(1)); } celltype = ""; celltype = attributes.getvalue("t"); if ("s".equals(celltype)) { nextisstring = true; } else { nextisstring = false; } } // 清楚缓存内容 lastcontents = ""; if (objects.nonnull(rownum) && currownum > rownum) { // 获取某一行数据完成 ,暂时不知道怎么能终止excel解析,直接抛出了异常,实际是成功的 throw new saxexception(finish_row_message); } } public void endelement(string uri, string localname, string name) throws saxexception { if (nextisstring) { int idx = integer.parseint(lastcontents); lastcontents = new xssfrichtextstring(sst.getentryat(idx)).tostring(); nextisstring = false; } if (name.equals("v")) { if (objects.isnull(rownum) || rownum == currownum) { row.add(lastcontents); } } } public void characters(char[] ch, int start, int length) throws saxexception { lastcontents += new string(ch, start, length); } }
multirowhandler.class 获取excel所有行的数据
package com.xxx.bi.utils.excel; import java.util.arraylist; import java.util.list; import java.util.objects; import java.util.regex.pattern; import org.apache.commons.lang3.stringutils; import org.apache.poi.xssf.model.sharedstringstable; import org.apache.poi.xssf.usermodel.xssfrichtextstring; import org.xml.sax.attributes; import org.xml.sax.saxexception; import org.xml.sax.helpers.defaulthandler; /** * 获取完整excel数据的handler<br/> * * @author administrator * */ public class multirowhandler extends defaulthandler { private int currownum = 0;// 行号,从1开始 private int curcolindex = -1;// 列索引,从0开始 private int colcnt = 0;// 列数,取第一行列数做为列总数 private string celltype = ""; private sharedstringstable sst; private string lastcontents; private boolean nextisstring; private string cellposition; private list<string> head = null; private list<string> currowdata = null; private boolean currowisblank = true;// 当前是个空行 private list<list<string>> rows = new arraylist<>(); public list<list<string>> getrows() { return rows; } public multirowhandler(sharedstringstable sst) { this.sst = sst; } @override public void startelement(string uri, string localname, string name, attributes attributes) throws saxexception { if (name.equals("c")) { cellposition = attributes.getvalue("r"); curcolindex = getcolindex(cellposition); // 这是一个新行 if (isnewrow(cellposition)) { currownum = getrownum(cellposition); if (2 == currownum && objects.nonnull(currowdata)) { head = currowdata; colcnt = head.size(); } currowdata = getblankrow(colcnt); } celltype = ""; celltype = attributes.getvalue("t"); if ("s".equals(celltype)) { nextisstring = true; } else { nextisstring = false; } } // 清楚缓存内容 lastcontents = ""; } private boolean isnewrow(string cellposition) { // 坐标以a开头,后面跟数字 或者坐标行和当前行不一致的 boolean newrow = pattern.compile("^a[0-9]+$").matcher(cellposition).find(); if (!newrow) { int cellrownum = getrownum(cellposition); newrow = (cellrownum != currownum); } return newrow; } /** * 根据列坐标获取行号,从1开始,返回0时标示出错 * * @param cellposition * 列坐标,为a1,b23等 * @return 行号,从1开始,返回0是为失败 */ private static int getrownum(string cellposition) { string strval = pattern.compile("[^0-9]").matcher(cellposition).replaceall("").trim();// 获取坐标中的数字 if (stringutils.isnotblank(strval)) { return integer.valueof(strval); } return 0; } /** * 根据列坐标返回当前列索引,从0开始,返回-1时标示出错<br/> * a1->0; b1->1...aa1->26 * * @param cellposition * 列坐标,为a1,b23等 * @return 列索引,从0开始,返回-1是为失败,a1->0; b1->1...aa1->26 */ private static int getcolindex(string cellposition) { int index = -1; int num = 65;// a的unicode码 int length = cellposition.length(); for (int i = 0; i < length; i++) { char c = cellposition.charat(i); if (character.isdigit(c)) { break;// 确定指定的char值是否为数字 } index = (index + 1) * 26 + (int) c - num; } return index; } /** * 返回一个全部为空字符串的空行 * * @param cnt * @return */ private list<string> getblankrow(int cnt) { list<string> result = new arraylist<>(cnt); for (int i = 0; i < cnt; i++) { result.add(i, ""); } currowisblank = true; return result; } @override public void endelement(string uri, string localname, string name) throws saxexception { if (nextisstring) { int idx = integer.parseint(lastcontents); lastcontents = new xssfrichtextstring(sst.getentryat(idx)).tostring(); nextisstring = false; } if (name.equals("v")) { // system.out.println(messageformat.format("当前列定位:{0},当前行:{1},当前列:{2},当前值:{3}", // cellposition, currownum, // curcolindex, lastcontents)); if (objects.isnull(head)) { currowdata.add(lastcontents); } else { currowdata.set(curcolindex, lastcontents); } currowisblank = false; // 这是一个新行 if (isnewrow(cellposition)) { if (objects.nonnull(currowdata)) { if (currowisblank) { currowdata.clear();// 如果当前行是空行,则清空当前行数据 } rows.add(currowdata); } } } } @override public void enddocument() throws saxexception { if (objects.nonnull(currowdata) && !currowisblank) { rows.add(currowdata);// 最后一行在上面不好加入,最后一行全是空行的不加入 } super.enddocument(); } @override public void characters(char[] ch, int start, int length) throws saxexception { lastcontents += new string(ch, start, length); } @override public void ignorablewhitespace(char[] ch, int start, int length) throws saxexception { lastcontents += ""; } public static void main(string[] args) { system.out.println(getcolindex("bc2")); } }
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持。