java实现查找PDF关键字所在页码及其坐标
程序员文章站
2022-04-16 15:53:36
1、因为最近有这方面的需求,用过之后记录一下。
2、此功能与在 PDF 阅读器中按 Ctrl+F 查找的性质一样:只能搜索 PDF 中的文本内容;如果 PDF 的内容是图片形式(例如扫描件),则无法定位到关键字。
import com.i...
1、因为最近有这方面的需求,用过之后记录一下。
2、此功能与在 PDF 阅读器中按 Ctrl+F 查找的性质一样:只能搜索 PDF 中的文本内容;如果 PDF 的内容是图片形式(例如扫描件),则无法定位到关键字。
import com.itextpdf.awt.geom.rectangle2d.float; import com.itextpdf.text.pdf.pdfdictionary; import com.itextpdf.text.pdf.pdfname; import com.itextpdf.text.pdf.pdfreader; import com.itextpdf.text.pdf.parser.*; import java.io.file; import java.io.fileinputstream; import java.io.ioexception; import java.util.arraylist; import java.util.list; /** * 消失的太阳 */ public class mytest { public static void main(string[] args) throws ioexception { //1.给定文件 file pdffile = new file("d://test.pdf"); //2.定义一个byte数组,长度为文件的长度 byte[] pdfdata = new byte[(int) pdffile.length()]; //3.io流读取文件内容到byte数组 fileinputstream inputstream = null; try { inputstream = new fileinputstream(pdffile); inputstream.read(pdfdata); } catch (ioexception e) { throw e; } finally { if (inputstream != null) { try { inputstream.close(); } catch (ioexception e) { } } } //4.指定关键字 string keyword = "消失的太阳:"; //5.调用方法,给定关键字和文件 list<float[]> positions = findkeywordpostions(pdfdata, keyword); //6.返回值类型是 list<float[]> 每个list元素代表一个匹配的位置,分别为 float[0]所在页码 float[1]所在x轴 float[2]所在y轴 system.out.println("total:" + positions.size()); if (positions != null && positions.size() > 0) { for (float[] position : positions) { system.out.print("pagenum: " + (int) position[0]); system.out.print("\tx: " + position[1]); system.out.println("\ty: " + position[2]); } } } /** * findkeywordpostions * @param pdfdata 通过io流 pdf文件转化的byte数组 * @param keyword 关键字 * @return list<float [ ]> : float[0]:pagenum float[1]:x float[2]:y * @throws ioexception */ public static list<float[]> findkeywordpostions(byte[] pdfdata, string keyword) throws ioexception { list<float[]> result = new arraylist<>(); list<pdfpagecontentpositions> pdfpagecontentpositions = getpdfcontentpostionslist(pdfdata); for (pdfpagecontentpositions pdfpagecontentposition : pdfpagecontentpositions) { list<float[]> charpositions = findpositions(keyword, pdfpagecontentposition); if (charpositions == null || charpositions.size() < 1) { continue; } result.addall(charpositions); } return result; 
} private static list<pdfpagecontentpositions> getpdfcontentpostionslist(byte[] pdfdata) throws ioexception { pdfreader reader = new pdfreader(pdfdata); list<pdfpagecontentpositions> result = new arraylist<>(); int pages = reader.getnumberofpages(); for (int pagenum = 1; pagenum <= pages; pagenum++) { float width = reader.getpagesize(pagenum).getwidth(); float height = reader.getpagesize(pagenum).getheight(); pdfrenderlistener pdfrenderlistener = new pdfrenderlistener(pagenum, width, height); //解析pdf,定位位置 pdfcontentstreamprocessor processor = new pdfcontentstreamprocessor(pdfrenderlistener); pdfdictionary pagedic = reader.getpagen(pagenum); pdfdictionary resourcesdic = pagedic.getasdict(pdfname.resources); try { processor.processcontent(contentbyteutils.getcontentbytesforpage(reader, pagenum), resourcesdic); } catch (ioexception e) { reader.close(); throw e; } string content = pdfrenderlistener.getcontent(); list<charposition> charpositions = pdfrenderlistener.getcharpositions(); list<float[]> positionslist = new arraylist<>(); for (charposition charposition : charpositions) { float[] positions = new float[]{charposition.getpagenum(), charposition.getx(), charposition.gety()}; positionslist.add(positions); } pdfpagecontentpositions pdfpagecontentpositions = new pdfpagecontentpositions(); pdfpagecontentpositions.setcontent(content); pdfpagecontentpositions.setpostions(positionslist); result.add(pdfpagecontentpositions); } reader.close(); return result; } private static list<float[]> findpositions(string keyword, pdfpagecontentpositions pdfpagecontentpositions) { list<float[]> result = new arraylist<>(); string content = pdfpagecontentpositions.getcontent(); list<float[]> charpositions = pdfpagecontentpositions.getpositions(); for (int pos = 0; pos < content.length(); ) { int positionindex = content.indexof(keyword, pos); if (positionindex == -1) { break; } float[] postions = charpositions.get(positionindex); result.add(postions); pos = positionindex + 1; } return 
result; } private static class pdfpagecontentpositions { private string content; private list<float[]> positions; public string getcontent() { return content; } public void setcontent(string content) { this.content = content; } public list<float[]> getpositions() { return positions; } public void setpostions(list<float[]> positions) { this.positions = positions; } } private static class pdfrenderlistener implements renderlistener { private int pagenum; private float pagewidth; private float pageheight; private stringbuilder contentbuilder = new stringbuilder(); private list<charposition> charpositions = new arraylist<>(); public pdfrenderlistener(int pagenum, float pagewidth, float pageheight) { this.pagenum = pagenum; this.pagewidth = pagewidth; this.pageheight = pageheight; } public void begintextblock() { } public void rendertext(textrenderinfo renderinfo) { list<textrenderinfo> characterrenderinfos = renderinfo.getcharacterrenderinfos(); for (textrenderinfo textrenderinfo : characterrenderinfos) { string word = textrenderinfo.gettext(); if (word.length() > 1) { word = word.substring(word.length() - 1, word.length()); } float rectangle = textrenderinfo.getascentline().getboundingrectange(); float x = (float)rectangle.getx(); float y = (float)rectangle.gety(); // float x = (float)rectangle.getcenterx(); // float y = (float)rectangle.getcentery(); // double x = rectangle.getminx(); // double y = rectangle.getmaxy(); //这两个是关键字在所在页面的xy轴的百分比 float xpercent = math.round(x / pagewidth * 10000) / 10000f; float ypercent = math.round((1 - y / pageheight) * 10000) / 10000f; // charposition charposition = new charposition(pagenum, xpercent, ypercent); charposition charposition = new charposition(pagenum, (float)x, (float)y); charpositions.add(charposition); contentbuilder.append(word); } } public void endtextblock() { } public void renderimage(imagerenderinfo renderinfo) { } public string getcontent() { return contentbuilder.tostring(); } public list<charposition> 
getcharpositions() { return charpositions; } } private static class charposition { private int pagenum = 0; private float x = 0; private float y = 0; public charposition(int pagenum, float x, float y) { this.pagenum = pagenum; this.x = x; this.y = y; } public int getpagenum() { return pagenum; } public float getx() { return x; } public float gety() { return y; } @override public string tostring() { return "[pagenum=" + this.pagenum + ",x=" + this.x + ",y=" + this.y + "]"; } } }
总结
以上所述是小编给大家介绍的 Java 实现查找 PDF 关键字所在页码及其坐标的方法,希望对大家有所帮助。
上一篇: 第13届D2大会一些参会感受和总结