diff --git a/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java b/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java index ecc413935d4..a39d0ad60cc 100644 --- a/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java +++ b/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java @@ -90,6 +90,9 @@ public final class ExtractText implements Callable @Option(names = "-html", description = "Output in HTML format instead of raw text") private boolean toHTML = false; + @Option(names = "-md", description = "Output in Markdown format instead of raw text") + private boolean toMD = false; + @Option(names = "-ignoreBeads", description = "Disables the separation by beads") private boolean ignoreBeads = false; @@ -148,7 +151,13 @@ public static void main( String[] args ) public Integer call() { // set file extension + if (toHTML && toMD) + { + SYSERR.println( "You can't set md and html at the same time"); + return 1; + } String ext = toHTML ? ".html" : ".txt"; + ext = toMD ? ".md" : ext; if (outfile == null) { @@ -175,7 +184,8 @@ public Integer call() AccessPermission ap = document.getCurrentAccessPermission(); if( ! ap.canExtractContent() ) { - SYSERR.println( "You do not have permission to extract text"); + System.out.println( "You do not have permission to extract text" ); + //SYSERR.println( "You do not have permission to extract text"); return 1; } @@ -209,13 +219,27 @@ public Integer call() } else { - if (rotationMagic) + if (toMD) { - stripper = new FilteredTextStripper(); + if (rotationMagic) + { + stripper = new FilteredText2Markdown(); + } + else + { + stripper = new PDFText2Markdown(); + } } else { - stripper = new PDFTextStripper(); + if (rotationMagic) + { + stripper = new FilteredTextStripper(); + } + else + { + stripper = new PDFTextStripper(); + } } stripper.setSortByPosition(sort); stripper.setShouldSeparateByBeads(!ignoreBeads); @@ -306,6 +330,7 @@ private void extractPages(int startPage, int endPage, { for (int p = startPage; p <= endPage; ++p) { + //System.err.println("page " + p); stripper.setStartPage(p); stripper.setEndPage(p); try @@ -414,10 +439,22 @@ protected void processTextPosition(TextPosition text) */ class FilteredTextStripper extends PDFTextStripper { - FilteredTextStripper() throws IOException + @Override + protected void processTextPosition(TextPosition text) { + int angle = ExtractText.getAngle(text); + if (angle == 0) + { + super.processTextPosition(text); + } } +} +/** + * PDFText2Markdown that only processes glyphs that have angle 0. + */ +class FilteredText2Markdown extends PDFText2Markdown +{ @Override protected void processTextPosition(TextPosition text) {