Skip to content

Commit

Permalink
PDFBOX-5487: Remove all space characters if contained within the adja…
Browse files Browse the repository at this point in the history
…cent letters, by Mohamed M NourElDin; closes #155

git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1922514 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
THausherr committed Dec 15, 2024
1 parent e4f814c commit 374972f
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 0 deletions.
33 changes: 33 additions & 0 deletions pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,11 @@ protected void writePage() throws IOException
{
IterativeMergeSort.sort(textList, comparator);
}
finally
{
// PDFBOX-5487: Remove all space characters if contained within the adjacent letters
removeContainedSpaces(textList);
}
}

startArticle();
Expand Down Expand Up @@ -724,6 +729,34 @@ private boolean overlap(float y1, float height1, float y2, float height2)
|| y1 <= y2 && y1 >= y2 - height2;
}

/**
 * Drops single-space entries that lie entirely inside the preceding retained entry.
 * <p>
 * The list is walked once and modified in place. Each entry is compared against the most
 * recently retained entry before it: an entry is removed when its unicode is exactly
 * {@code " "} and {@link TextPosition#completelyContains(TextPosition)} of that retained entry
 * returns true for it. The first entry of the list is never removed.
 *
 * @param textList the text positions to filter; must support removal through its iterator.
 */
private void removeContainedSpaces(List<TextPosition> textList)
{
    Iterator<TextPosition> it = textList.iterator();
    if (!it.hasNext())
    {
        // nothing to compare against
        return;
    }

    // the last entry that was kept; removed spaces never become the reference
    TextPosition lastKept = it.next();
    while (it.hasNext())
    {
        TextPosition candidate = it.next();
        boolean isContainedSpace = " ".equals(candidate.getUnicode())
                && lastKept.completelyContains(candidate);
        if (!isContainedSpace)
        {
            lastKept = candidate;
            continue;
        }
        it.remove();
    }
}

/**
* Write the line separator value to the output stream.
*
Expand Down
49 changes: 49 additions & 0 deletions pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,55 @@ else if (tp2Xstart < thisXstart && tp2Xend < thisXend)
return true;
}

/**
 * Tells whether the bounding box of another TextPosition lies entirely within the bounding
 * box of this one, i.e. the other box does not stick out on any of the four sides when both
 * are rendered on top of each other. Boxes sharing an edge still count as contained.
 *
 * @param tp2 The other TextPosition to compare against
 * @return True if tp2 is contained completely inside the bounding box of this text.
 */
public boolean completelyContains(TextPosition tp2)
{
    // Note: (0, 0) is in the upper left and y-coordinate is top of TextPosition
    //
    //   outerLeft                      outerRight
    //       +---------outerTop-------------+
    //       |    +-----innerTop-----+      |
    //       | innerLeft         innerRight |
    //       |    +----innerBottom---+      |
    //       +--------outerBottom-----------+

    // horizontal extent first, so the vertical values are only read when still needed
    float outerLeft = getXDirAdj();
    float outerRight = outerLeft + getWidthDirAdj();

    float innerLeft = tp2.getXDirAdj();
    float innerRight = innerLeft + tp2.getWidthDirAdj();

    boolean sticksOutHorizontally = innerLeft < outerLeft || innerRight > outerRight;
    if (sticksOutHorizontally)
    {
        return false;
    }

    float outerTop = getYDirAdj();
    float outerBottom = outerTop + getHeightDir();

    float innerTop = tp2.getYDirAdj();
    float innerBottom = innerTop + tp2.getHeightDir();

    boolean sticksOutVertically = innerTop < outerTop || innerBottom > outerBottom;
    return !sticksOutVertically;
}

/**
* Merge a single character TextPosition into the current object. This is to be used only for
* cases where we have a diacritic that overlaps an existing TextPosition. In a graphical
Expand Down

0 comments on commit 374972f

Please sign in to comment.