昨天花了一整天研究 PDF 关键字定位,走了不少弯路,网上找到的很多方案都有瑕疵,今天有时间跟大家分享一下:
以下方法逐页扫描 PDF 的内容,定位出关键字所在的页码及其大致坐标。
import com.google.common.collect.Lists;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.AcroFields;
import com.itextpdf.text.pdf.AcroFields.FieldPosition;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import java.io.IOException;
import java.util.List;
// 定义返回页码 PRivate static int i = 0; private static com.itextpdf.awt.geom.Rectangle2D.Float boundingRectange =null; private static StringBuilder content; private static List<Object[]> arrays = Lists.newArrayList();
private static List<Object[]> getKeyWords(String filePath, final String keyWord) {
try { PdfReader pdfReader = new PdfReader(filePath); int pageNum = pdfReader.getNumberOfPages(); PdfReaderContentParser pdfReaderContentParser = new PdfReaderContentParser(pdfReader); for (i = 1; i < pageNum; i++) { content = new StringBuilder(); boundingRectange =new com.itextpdf.awt.geom.Rectangle2D.Float(); pdfReaderContentParser.processContent(i, new RenderListener() { @Override public void renderText(TextRenderInfo textRenderInfo) { String text = textRenderInfo.getText(); // 整页内容 content.append(text); boundingRectange= textRenderInfo.getBaseline().getBoundingRectange(); /*if (null != text && StringUtils.contains(content, keyWord)) { float[] resu = new float[3]; resu[0] = boundingRectange.x; resu[1] = boundingRectange.y; resu[2] = i; arrays.add(resu); }*/ } @Override public void renderImage(ImageRenderInfo arg0) { // TODO Auto-generated method stub } @Override public void endTextBlock() { // TODO Auto-generated method stub } @Override public void beginTextBlock() { // TODO Auto-generated method stub } }); if (null != content && StringUtils.contains(content, keyWord)) { Object[] resu = new Object[4]; resu[0] = content; resu[1] = boundingRectange.x; resu[2] = boundingRectange.y; resu[3] = i; arrays.add(resu); } // System.out.println("第"+i+"页,内容:"+content); } } catch (IOException e) { e.printStackTrace(); } return arrays; }以上方法中使用到的jar包
itextpdf-5.5.6.jar
新闻热点
疑难解答