C#解析PDF的方式有很多,比较好用的有iTextSharp和PDFBox。
PDF内容页如果是图片类型,例如扫描件,则需要进行OCR(光学字符识别)。
对于文本内容的PDF文档,在解析过程中,我目前只发现能以字符串的形式读取其内容,无法直接读取其中的表格。据说PDF文档结构中没有表格的概念,因此表格自然读不出来;如果确实如此,那么PDF中表格内容的解析,就只能对获取到的字符串按照一定的逻辑自行处理了。
iTextSharp是一个C#开源项目;PDFBox为Java开源项目,借助IKVM在.NET平台下也有实现。
PDF转换为Image使用的是Ghostscript,既可以以API的方式调用,也可以以Windows命令行的方式调用。
OCR使用的是Asprise,识别效果较好(商业软件);另外还可以使用MS的Image Scanning(2007)或OneNote(2010)(需要依赖Office组件),以及Tesseract(HP->Google)(效果很差)。
附上iTextSharp、PDFBox对PDF的解析代码。
iTextSharp辅助类
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

namespace eyuan
{
    /// <summary>
    /// Helpers built on iTextSharp for reading the text and the page count of a PDF file.
    /// Failures are reported through LogHandler.LogWrite and signalled to the caller
    /// with sentinel return values instead of exceptions.
    /// </summary>
    public static class ITextSharpHandler
    {
        /// <summary>
        /// Reads the text content of every page of a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>
        /// The text of all pages, each page followed by a line break.
        /// An empty string when the file does not exist or cannot be opened.
        /// When extraction fails part-way, the text collected so far is returned.
        /// </returns>
        public static string ReadPdf(string fileName)
        {
            PdfReader reader = OpenReader(fileName);
            if (reader == null)
            {
                return string.Empty;
            }

            StringBuilder content = new StringBuilder();
            try
            {
                // iTextSharp page numbers are 1-based.
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    content.AppendLine(PdfTextExtractor.GetTextFromPage(reader, page));
                }
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and fall through to return the pages already extracted.
                LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
            }
            finally
            {
                reader.Close();
            }

            return content.ToString();
        }

        /// <summary>
        /// Gets the number of pages of a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to inspect.</param>
        /// <returns>The page count, or -1 when the file does not exist or cannot be opened.</returns>
        public static int GetPdfPageCount(string fileName)
        {
            PdfReader reader = OpenReader(fileName);
            if (reader == null)
            {
                return -1;
            }

            try
            {
                return reader.NumberOfPages;
            }
            finally
            {
                // The reader was previously left open on this path; always release it.
                reader.Close();
            }
        }

        /// <summary>
        /// Opens a PdfReader for the given file, logging the reason when that is not possible.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to open.</param>
        /// <returns>An open reader the caller must close, or null when the file is missing or fails to load.</returns>
        private static PdfReader OpenReader(string fileName)
        {
            if (!File.Exists(fileName))
            {
                LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
                return null;
            }

            try
            {
                return new PdfReader(fileName);
            }
            catch (Exception ex)
            {
                // If the constructor throws there is no reader instance to close.
                LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
                return null;
            }
        }
    }
}
PDFBox辅助类
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace eyuan
{
    /// <summary>
    /// Helper that extracts text from a PDF file with the PDFBox component.
    /// </summary>
    public static class PdfBoxHandler
    {
        /// <summary>
        /// Parses a PDF file with PDFBox and returns its text.
        /// </summary>
        /// <param name="input">Path of the PDF file.</param>
        /// <returns>
        /// The text content of the PDF; null when the file does not exist,
        /// cannot be loaded, or text extraction fails.
        /// </returns>
        public static string ReadPdf(string input)
        {
            // Guard: nothing to do when the file is missing.
            if (!File.Exists(input))
            {
                LogHandler.LogWrite(@"指定的PDF文件不存在:" + input);
                return null;
            }

            // Step 1: load the document.
            PDDocument document = null;
            try
            {
                document = PDDocument.load(input);
            }
            catch (Exception loadError)
            {
                // The assignment never completed, so there is no document to close here.
                LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", input, loadError.ToString()));
                return null;
            }

            // Step 2: extract the text; the document is closed whether or not this succeeds.
            string extractedText = null;
            try
            {
                PDFTextStripper textStripper = new PDFTextStripper();
                extractedText = textStripper.getText(document);
            }
            catch (Exception parseError)
            {
                LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", input, parseError.ToString()));
            }
            finally
            {
                if (document != null)
                {
                    document.close();
                }
            }

            return extractedText;
        }
    }
}
另外附上PDF转Image,然后对Image进行OCR的代码。
转换PDF为Jpeg图片代码(GhostScript辅助类)
1 using System; 2 using System.Collections; 3 using System.Collections.Generic; 4 using System.Runtime.InteropServices; 5 using System.Text; 6 7 namespace eyuan 8 { 9 public class GhostscriptHandler 10 { 11 12 #region GhostScript Import 13 /// <summary>创建Ghostscript的实例 14 /// This instance is passed to most other gsapi functions. 15 /// The caller_handle will be PRovided to callback functions. 16 /// At this stage, Ghostscript supports only one instance. </summary> 17 /// <param name="pinstance"></param> 18 /// <param name="caller_handle"></param> 19 /// <returns></returns> 20 [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")] 21 private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle); 22 /// <summary>This is the important function that will perform the conversion 23 /// 24 /// </summary> 25 /// <param name="instance"></param> 26 /// <param name="argc"></param> 27 /// <param name="argv"></param> 28 /// <returns></returns> 29 [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")] 30 private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv); 31 /// <summary> 32 /// Exit the interpreter. 33 /// This must be called on shutdown if gsapi_init_with_args() has been called, 34 /// and just before gsapi_delete_instance(). 35 /// 退出 36 /// </summary> 37 /// <param name="instance"></param> 38 /// <returns></returns> 39 [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")] 40 private static extern int gsapi_exit(IntPtr instance); 41 /// <summary> 42 /// Destroy an instance
新闻热点
疑难解答