// Code listing:
package com.pdfbox.util.test
import org.apache.pdfbox.exceptions.InvalidPasswordException
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.PDPage
import org.apache.pdfbox.util.PDFTextStripperByArea
import java.awt.Rectangle
import java.util.List
public class ExtractTextByArea
{
ITEXT插件方法/**
* @param pdf PDF文件路径
* @param txt 输出文本文件路径
* @throws IOException
*/
public void parsePdf(String pdf, String txt) throws IOException {
PdfReader reader = new PdfReader(pdf)
PrintWriter out = new PrintWriter(new FileOutputStream(txt))
Rectangle rect = new Rectangle(70, 80, 490, 580)
RenderFilter filter = new RegionTextRenderFilter(rect)
TextExtractionStrategy strategy
for (int i = 1i <= reader.getNumberOfPages()i++) {
strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter)
out.println(PdfTextExtractor.getTextFromPage(reader, i, strategy))
}
out.flush()
out.close()
reader.close()
}
PDFBOX插件方法PDDocument document = PDDocument.load( args[0] )
if( document.isEncrypted() )
{
document.decrypt( "" )
}
PDFTextStripperByArea stripper = new PDFTextStripperByArea()
stripper.setSortByPosition( true )
Rectangle rect = new Rectangle( 10, 280, 275, 60 )
stripper.addRegion( "class1", rect )
List allPages = document.getDocumentCatalog().getAllPages()
PDPage firstPage = (PDPage)allPages.get( 0 )
stripper.extractRegions( firstPage )
System.out.println( "Text in the area:" + rect )
System.out.println( stripper.getTextForRegion( "class1" ) )