iText 读取 PDF 的各级标题和标题对应页码 博客分类: 文档处理 Java Web iText word 标题 页码
程序员文章站
2024-03-25 18:59:58
...
因为要做全文检索的项目,需要获取 PDF 文件的各级标题以及每个标题对应的页码。上网查了相关资料后,决定使用 iText 开发包。需要注意的是,这里的“标题”取自 PDF 的书签(大纲),如果 PDF 本身没有书签,则用这种方法取不到任何标题。废话不多说,直接上代码。
import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.SimpleBookmark; public class TestPdf { public static void main ( String [] args ) throws Exception { PdfReader reader = new PdfReader ( "e:/test.pdf" ) ; List<HashMap<String, Object>> list = SimpleBookmark.getBookmark ( reader ) ; for ( Iterator<HashMap<String, Object>> i = list.iterator () ; i.hasNext () ; ) { showBookmark ( i.next ()) ; } for ( Iterator<HashMap<String, Object>> i = list.iterator () ; i.hasNext () ; ) { getPageNumbers( i.next ()); } } //获取标题 private static void showBookmark ( HashMap<String, Object> bookmark) { System.out.println (bookmark.get ( "Title" )) ; @SuppressWarnings("unchecked") ArrayList<HashMap<String, Object>> kids = (ArrayList<HashMap<String, Object>>) bookmark.get ( "Kids" ) ; if ( kids == null ) return ; for ( Iterator<HashMap<String, Object>> i = kids.iterator () ; i.hasNext () ; ) { showBookmark ( i.next ()) ; } } //获取页码 public static void getPageNumbers(HashMap<String, Object> bookmark) { if (bookmark == null) return; if ("GoTo".equals(bookmark.get("Action"))) { String page = (String)bookmark.get("Page"); if (page != null) { page = page.trim(); int idx = page.indexOf(' '); int pageNum; if (idx < 0){ pageNum = Integer.parseInt(page); System.out.println ("pageNum :"+ pageNum) ; } else{ pageNum = Integer.parseInt(page.substring(0, idx)); System.out.println ("pageNum:" +pageNum) ; } } @SuppressWarnings("unchecked") ArrayList<HashMap<String, Object>> kids = (ArrayList<HashMap<String, Object>>) bookmark.get ( "Kids" ) ; if ( kids == null ) return ; for ( Iterator<HashMap<String, Object>> i = kids.iterator () ; i.hasNext () ; ) { getPageNumbers ( i.next ()) ; } } } }