Java 实现查找 PDF 关键字所在页码及其坐标

程序员文章站 2022-04-16 15:53:36
1、因为最近有这方面的需求，用过之后记录一下。 2、此功能跟pdf中ctrl+f性质一样，如果pdf中为图片形式的不支持定位到关键字。 import com.i...
1、最近工作中有这方面的需求，用过之后在此记录一下。
2、此功能与 PDF 阅读器中的 Ctrl+F 查找性质相同：如果 PDF 中的内容是图片形式（例如扫描件），则无法定位到关键字。
import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
/**
 * 消失的太阳
 */
public class mytest {
 public static void main(string[] args) throws ioexception {
  //1.给定文件
  file pdffile = new file("d://test.pdf");
  //2.定义一个byte数组，长度为文件的长度
  byte[] pdfdata = new byte[(int) pdffile.length()];
  //3.io流读取文件内容到byte数组
  fileinputstream inputstream = null;
  try {
   inputstream = new fileinputstream(pdffile);
   inputstream.read(pdfdata);
  } catch (ioexception e) {
   throw e;
  } finally {
   if (inputstream != null) {
    try {
     inputstream.close();
    } catch (ioexception e) {
    }
   }
  }
  //4.指定关键字
  string keyword = "消失的太阳：";
  //5.调用方法，给定关键字和文件
  list<float[]> positions = findkeywordpostions(pdfdata, keyword);
  //6.返回值类型是 list<float[]> 每个list元素代表一个匹配的位置，分别为 float[0]所在页码 float[1]所在x轴 float[2]所在y轴
  system.out.println("total:" + positions.size());
  if (positions != null && positions.size() > 0) {
   for (float[] position : positions) {
    system.out.print("pagenum: " + (int) position[0]);
    system.out.print("\tx: " + position[1]);
    system.out.println("\ty: " + position[2]);
   }
  }
 }
 /**
  * findkeywordpostions
  * @param pdfdata  通过io流 pdf文件转化的byte数组
  * @param keyword  关键字
  * @return list<float [ ]> : float[0]:pagenum float[1]:x float[2]:y
  * @throws ioexception
  */
 public static list<float[]> findkeywordpostions(byte[] pdfdata, string keyword) throws ioexception {
  list<float[]> result = new arraylist<>();
  list<pdfpagecontentpositions> pdfpagecontentpositions = getpdfcontentpostionslist(pdfdata);
  for (pdfpagecontentpositions pdfpagecontentposition : pdfpagecontentpositions) {
   list<float[]> charpositions = findpositions(keyword, pdfpagecontentposition);
   if (charpositions == null || charpositions.size() < 1) {
    continue;
   }
   result.addall(charpositions);
  }
  return result;
 }
 private static list<pdfpagecontentpositions> getpdfcontentpostionslist(byte[] pdfdata) throws ioexception {
  pdfreader reader = new pdfreader(pdfdata);
  list<pdfpagecontentpositions> result = new arraylist<>();
  int pages = reader.getnumberofpages();
  for (int pagenum = 1; pagenum <= pages; pagenum++) {
   float width = reader.getpagesize(pagenum).getwidth();
   float height = reader.getpagesize(pagenum).getheight();
   pdfrenderlistener pdfrenderlistener = new pdfrenderlistener(pagenum, width, height);
   //解析pdf，定位位置
   pdfcontentstreamprocessor processor = new pdfcontentstreamprocessor(pdfrenderlistener);
   pdfdictionary pagedic = reader.getpagen(pagenum);
   pdfdictionary resourcesdic = pagedic.getasdict(pdfname.resources);
   try {
    processor.processcontent(contentbyteutils.getcontentbytesforpage(reader, pagenum), resourcesdic);
   } catch (ioexception e) {
    reader.close();
    throw e;
   }
   string content = pdfrenderlistener.getcontent();
   list<charposition> charpositions = pdfrenderlistener.getcharpositions();
   list<float[]> positionslist = new arraylist<>();
   for (charposition charposition : charpositions) {
    float[] positions = new float[]{charposition.getpagenum(), charposition.getx(), charposition.gety()};
    positionslist.add(positions);
   }
   pdfpagecontentpositions pdfpagecontentpositions = new pdfpagecontentpositions();
   pdfpagecontentpositions.setcontent(content);
   pdfpagecontentpositions.setpostions(positionslist);
   result.add(pdfpagecontentpositions);
  }
  reader.close();
  return result;
 }
 private static list<float[]> findpositions(string keyword, pdfpagecontentpositions pdfpagecontentpositions) {
  list<float[]> result = new arraylist<>();
  string content = pdfpagecontentpositions.getcontent();
  list<float[]> charpositions = pdfpagecontentpositions.getpositions();
  for (int pos = 0; pos < content.length(); ) {
   int positionindex = content.indexof(keyword, pos);
   if (positionindex == -1) {
    break;
   }
   float[] postions = charpositions.get(positionindex);
   result.add(postions);
   pos = positionindex + 1;
  }
  return result;
 }
 private static class pdfpagecontentpositions {
  private string content;
  private list<float[]> positions;
  public string getcontent() {
   return content;
  }
  public void setcontent(string content) {
   this.content = content;
  }
  public list<float[]> getpositions() {
   return positions;
  }
  public void setpostions(list<float[]> positions) {
   this.positions = positions;
  }
 }
 private static class pdfrenderlistener implements renderlistener {
  private int pagenum;
  private float pagewidth;
  private float pageheight;
  private stringbuilder contentbuilder = new stringbuilder();
  private list<charposition> charpositions = new arraylist<>();
  public pdfrenderlistener(int pagenum, float pagewidth, float pageheight) {
   this.pagenum = pagenum;
   this.pagewidth = pagewidth;
   this.pageheight = pageheight;
  }
  public void begintextblock() {
  }
  public void rendertext(textrenderinfo renderinfo) {
   list<textrenderinfo> characterrenderinfos = renderinfo.getcharacterrenderinfos();
   for (textrenderinfo textrenderinfo : characterrenderinfos) {
    string word = textrenderinfo.gettext();
    if (word.length() > 1) {
     word = word.substring(word.length() - 1, word.length());
    }
    float rectangle = textrenderinfo.getascentline().getboundingrectange();
    float x = (float)rectangle.getx();
    float y = (float)rectangle.gety();
//    float x = (float)rectangle.getcenterx();
//    float y = (float)rectangle.getcentery();
//    double x = rectangle.getminx();
//    double y = rectangle.getmaxy();
    //这两个是关键字在所在页面的xy轴的百分比
    float xpercent = math.round(x / pagewidth * 10000) / 10000f;
    float ypercent = math.round((1 - y / pageheight) * 10000) / 10000f;
//    charposition charposition = new charposition(pagenum, xpercent, ypercent);
    charposition charposition = new charposition(pagenum, (float)x, (float)y);
    charpositions.add(charposition);
    contentbuilder.append(word);
   }
  }
  public void endtextblock() {
  }
  public void renderimage(imagerenderinfo renderinfo) {
  }
  public string getcontent() {
   return contentbuilder.tostring();
  }
  public list<charposition> getcharpositions() {
   return charpositions;
  }
 }
 private static class charposition {
  private int pagenum = 0;
  private float x = 0;
  private float y = 0;
  public charposition(int pagenum, float x, float y) {
   this.pagenum = pagenum;
   this.x = x;
   this.y = y;
  }
  public int getpagenum() {
   return pagenum;
  }
  public float getx() {
   return x;
  }
  public float gety() {
   return y;
  }
  @override
  public string tostring() {
   return "[pagenum=" + this.pagenum + ",x=" + this.x + ",y=" + this.y + "]";
  }
 }
}
总结
以上所述是小编给大家介绍的 Java 实现查找 PDF 关键字所在页码及其坐标的方法，希望对大家有所帮助。
上一篇：第13届D2大会一些参会感受和总结
下一篇：国画和西画有哪些区别？中国画重神韵，西方画重形似