Skip to content

Commit

Permalink
PDFBOX-5729: added GSUB worker for Devanagari and test text, by JAVAU…
Browse files Browse the repository at this point in the history
…SER; add language to factory

git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1914485 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
THausherr committed Dec 9, 2023
1 parent c5a85a7 commit 58f763e
Show file tree
Hide file tree
Showing 3 changed files with 327 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
import org.apache.fontbox.ttf.model.GsubData;
import org.apache.fontbox.ttf.model.Language;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
* Gets a {@link Language} specific instance of a {@link GsubWorker}
*
Expand All @@ -29,19 +32,25 @@
*/
public class GsubWorkerFactory
{
    private static final Logger LOG = LogManager.getLogger(GsubWorkerFactory.class);

    /**
     * Picks the {@link GsubWorker} implementation that matches the language reported by the
     * given GSUB data.
     *
     * @param cmapLookup the cmap lookup handed on to the language specific worker
     * @param gsubData the GSUB data; its {@link GsubData#getLanguage() language} selects the
     * worker
     * @return a worker for Bengali, Devanagari or Latin, or a {@link DefaultGsubWorker} for any
     * other language
     */
    public GsubWorker getGsubWorker(CmapLookup cmapLookup, GsubData gsubData)
    {
        //TODO this needs to be redesigned / improved because if a font supports several languages,
        // it will choose one of them and maybe not the one expected.
        Language language = gsubData.getLanguage();
        LOG.debug("Language: {}", language);

        GsubWorker worker;
        switch (language)
        {
            case BENGALI:
                worker = new GsubWorkerForBengali(cmapLookup, gsubData);
                break;
            case DEVANAGARI:
                worker = new GsubWorkerForDevanagari(cmapLookup, gsubData);
                break;
            //case GUJARATI:
            //    worker = new GsubWorkerForGujarati(cmapLookup, gsubData);
            //    break;
            case LATIN:
                worker = new GsubWorkerForLatin(cmapLookup, gsubData);
                break;
            default:
                worker = new DefaultGsubWorker();
                break;
        }
        return worker;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.fontbox.ttf.gsub;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import org.apache.fontbox.ttf.CmapLookup;
import org.apache.fontbox.ttf.model.GsubData;
import org.apache.fontbox.ttf.model.ScriptFeature;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
*
* Devanagari-specific implementation of GSUB system
*
* @author JAVAUSER
*
*/
public class GsubWorkerForDevanagari implements GsubWorker
{
    private static final Logger LOG = LogManager.getLogger(GsubWorkerForDevanagari.class);

    private static final String RKRF_FEATURE = "rkrf";
    private static final String VATU_FEATURE = "vatu";

    /**
     * This sequence is very important. This has been taken from <a href=
     * "https://docs.microsoft.com/en-us/typography/script-development/devanagari">https://docs.microsoft.com/en-us/typography/script-development/devanagari</a>
     */
    private static final List<String> FEATURES_IN_ORDER = Arrays.asList("locl", "nukt", "akhn",
            "rphf", RKRF_FEATURE, "blwf", "half", VATU_FEATURE, "cjct", "pres", "abvs", "blws",
            "psts", "haln", "calt");

    // Reph glyphs: Devanagari letter RA (U+0930) followed by sign VIRAMA (U+094D)
    private static final char[] REPH_CHARS = {'\u0930', '\u094D'};
    // Glyphs to precede reph: vowel sign AA (U+093E) and vowel sign II (U+0940)
    private static final char[] BEFORE_REPH_CHARS = {'\u093E', '\u0940'};

    // Devanagari vowel sign I
    private static final char BEFORE_HALF_CHAR = '\u093F';

    private final CmapLookup cmapLookup;
    private final GsubData gsubData;

    // glyph ids of REPH_CHARS, in the same order: index 0 is RA, index 1 is VIRAMA
    private final List<Integer> rephGlyphIds;
    // glyph ids of BEFORE_REPH_CHARS
    private final List<Integer> beforeRephGlyphIds;
    // glyph id of BEFORE_HALF_CHAR
    private final List<Integer> beforeHalfGlyphIds;

    /**
     * Creates the worker and resolves, through the cmap, the glyph ids of the characters that
     * the reordering steps need to recognise.
     *
     * @param cmapLookup used to map the fixed Devanagari characters to glyph ids
     * @param gsubData the GSUB data providing the features to apply
     */
    GsubWorkerForDevanagari(CmapLookup cmapLookup, GsubData gsubData)
    {
        this.cmapLookup = cmapLookup;
        this.gsubData = gsubData;
        beforeHalfGlyphIds = getGlyphIds(BEFORE_HALF_CHAR);
        rephGlyphIds = getGlyphIds(REPH_CHARS);
        beforeRephGlyphIds = getGlyphIds(BEFORE_REPH_CHARS);
    }

    /**
     * Reorders the glyphs (reph first, then vowel sign I) and afterwards applies every feature
     * of {@link #FEATURES_IN_ORDER}, in that order.
     * <p>
     * A feature that the GSUB data does not support is skipped. The only exception is "rkrf":
     * when it is missing but "vatu" is supported, a substitute is derived from the "vatu" data.
     *
     * @param originalGlyphIds the glyph ids to transform; this list is not modified
     * @return an unmodifiable list holding the transformed glyph ids
     */
    @Override
    public List<Integer> applyTransforms(List<Integer> originalGlyphIds)
    {
        List<Integer> intermediateGlyphsFromGsub = adjustRephPosition(originalGlyphIds);
        intermediateGlyphsFromGsub = repositionGlyphs(intermediateGlyphsFromGsub);
        for (String feature : FEATURES_IN_ORDER)
        {
            if (!gsubData.isFeatureSupported(feature))
            {
                if (feature.equals(RKRF_FEATURE) && gsubData.isFeatureSupported(VATU_FEATURE))
                {
                    // Create your own rkrf feature from vatu feature
                    intermediateGlyphsFromGsub = applyRKRFFeature(
                            gsubData.getFeature(VATU_FEATURE),
                            intermediateGlyphsFromGsub);
                }
                LOG.debug("the feature {} was not found", feature);
                continue;
            }

            LOG.debug("applying the feature {}", feature);
            ScriptFeature scriptFeature = gsubData.getFeature(feature);
            intermediateGlyphsFromGsub = applyGsubFeature(scriptFeature,
                    intermediateGlyphsFromGsub);
        }
        return Collections.unmodifiableList(intermediateGlyphsFromGsub);
    }

    /**
     * Stand-in for a missing "rkrf" feature: every VIRAMA glyph that is directly followed by a
     * RA glyph is replaced, together with that RA glyph, by a single replacement glyph taken
     * from the given feature. Positions 0 and 1 of the list are never treated as the RA
     * position.
     *
     * @param rkrfGlyphsForSubstitution the feature to take the replacement glyph from (the
     * caller passes the "vatu" feature)
     * @param originalGlyphIds the glyph ids to process; this list is not modified
     * @return a new list with the substitutions applied, or {@code originalGlyphIds} itself if
     * no replacement glyph could be determined
     */
    private List<Integer> applyRKRFFeature(ScriptFeature rkrfGlyphsForSubstitution,
            List<Integer> originalGlyphIds)
    {
        Set<List<Integer>> rkrfGlyphIds = rkrfGlyphsForSubstitution.getAllGlyphIdsForSubstitution();
        if (rkrfGlyphIds.isEmpty())
        {
            LOG.debug("Glyph substitution list for {} is empty.", rkrfGlyphsForSubstitution.getName());
            return originalGlyphIds;
        }
        // Replace this with better implementation to get second GlyphId from rkrfGlyphIds.
        // For now the second entry of the first multi-glyph list returned by the set's iterator
        // is used; 0 doubles as the "nothing found" marker.
        int rkrfReplacement = 0;
        for (List<Integer> firstList : rkrfGlyphIds)
        {
            if (firstList.size() > 1)
            {
                rkrfReplacement = firstList.get(1);
                break;
            }
        }

        if (rkrfReplacement == 0)
        {
            LOG.debug("Cannot find rkrf candidate. The rkrfGlyphIds doesn't contain lists of two elements.");
            return originalGlyphIds;
        }

        // looked up once instead of on every iteration
        int raGlyphId = rephGlyphIds.get(0);
        int viramaGlyphId = rephGlyphIds.get(1);

        List<Integer> rkrfList = new ArrayList<>(originalGlyphIds);
        // Walking backwards keeps the indices of the not yet visited part of rkrfList in sync
        // with originalGlyphIds, because a removal only shifts elements behind the current index.
        for (int index = originalGlyphIds.size() - 1; index > 1; index--)
        {
            int raGlyph = originalGlyphIds.get(index);
            if (raGlyph == raGlyphId)
            {
                int viramaGlyph = originalGlyphIds.get(index - 1);
                if (viramaGlyph == viramaGlyphId)
                {
                    rkrfList.set(index - 1, rkrfReplacement);
                    rkrfList.remove(index);
                }
            }
        }
        return rkrfList;
    }

    /**
     * Moves a leading RA + VIRAMA pair behind the glyph that follows it, and also behind the
     * glyph after that one if it is one of the "before reph" vowel signs.
     * <p>
     * Matches are detected on {@code originalGlyphIds} while the changes are written to a copy,
     * so a match is never influenced by an earlier move.
     *
     * @param originalGlyphIds the glyph ids to process; this list is not modified
     * @return a new list of the same size with the reph pairs repositioned
     */
    private List<Integer> adjustRephPosition(List<Integer> originalGlyphIds)
    {
        // looked up once instead of on every iteration
        int raGlyphId = rephGlyphIds.get(0);
        int viramaGlyphId = rephGlyphIds.get(1);

        List<Integer> rephAdjustedList = new ArrayList<>(originalGlyphIds);
        // a match needs at least one glyph after RA + VIRAMA, hence the "- 2"
        for (int index = 0; index < originalGlyphIds.size() - 2; index++)
        {
            int raGlyph = originalGlyphIds.get(index);
            int viramaGlyph = originalGlyphIds.get(index + 1);
            if (raGlyph == raGlyphId && viramaGlyph == viramaGlyphId)
            {
                // RA, VIRAMA, X  ->  X, RA, VIRAMA
                int nextConsonantGlyph = originalGlyphIds.get(index + 2);
                rephAdjustedList.set(index, nextConsonantGlyph);
                rephAdjustedList.set(index + 1, raGlyph);
                rephAdjustedList.set(index + 2, viramaGlyph);

                if (index + 3 < originalGlyphIds.size())
                {
                    int matraGlyph = originalGlyphIds.get(index + 3);
                    if (beforeRephGlyphIds.contains(matraGlyph))
                    {
                        // RA, VIRAMA, X, M  ->  X, M, RA, VIRAMA
                        rephAdjustedList.set(index + 1, matraGlyph);
                        rephAdjustedList.set(index + 2, raGlyph);
                        rephAdjustedList.set(index + 3, viramaGlyph);
                    }
                }
            }
        }
        return rephAdjustedList;
    }

    /**
     * Moves the glyph of vowel sign I towards the start of the list.
     * <p>
     * The list is walked from its last element towards the first one:
     * <ul>
     * <li>a vowel sign I glyph is swapped with the glyph in front of it;</li>
     * <li>a VIRAMA glyph directly followed by a vowel sign I glyph causes that vowel sign to be
     * moved in front of the glyph that precedes the VIRAMA.</li>
     * </ul>
     * After each move one extra position is skipped, so the walk continues in front of the
     * position the vowel sign was inserted at. The element at index 0 is never examined.
     *
     * @param originalGlyphIds the glyph ids to process; this list is not modified
     * @return a new list of the same size with the vowel signs repositioned
     */
    private List<Integer> repositionGlyphs(List<Integer> originalGlyphIds)
    {
        List<Integer> repositionedGlyphIds = new ArrayList<>(originalGlyphIds);
        // every move is a remove followed by an add, so the size stays constant
        int listSize = repositionedGlyphIds.size();
        int foundIndex = listSize - 1;
        int nextIndex = listSize - 2;
        while (nextIndex > -1)
        {
            int glyph = repositionedGlyphIds.get(foundIndex);
            // index visited just before this one in the backward walk, i.e. the following glyph
            int prevIndex = foundIndex + 1;
            if (beforeHalfGlyphIds.contains(glyph))
            {
                // remove(int) is the index based overload
                repositionedGlyphIds.remove(foundIndex);
                repositionedGlyphIds.add(nextIndex--, glyph);
            }
            else if (rephGlyphIds.get(1).equals(glyph) && prevIndex < listSize)
            {
                int prevGlyph = repositionedGlyphIds.get(prevIndex);
                if (beforeHalfGlyphIds.contains(prevGlyph))
                {
                    repositionedGlyphIds.remove(prevIndex);
                    repositionedGlyphIds.add(nextIndex--, prevGlyph);
                }
            }
            foundIndex = nextIndex--;
        }
        return repositionedGlyphIds;
    }

    /**
     * Applies one GSUB feature: the glyphs are split into chunks based on the glyph sequences
     * the feature can substitute, and every chunk the feature can replace is swapped for its
     * replacement glyph; all other chunks are copied unchanged.
     *
     * @param scriptFeature the feature to apply
     * @param originalGlyphs the glyph ids to process; this list is not modified
     * @return a new list with the substitutions applied, or {@code originalGlyphs} itself if
     * the feature has nothing to substitute
     */
    private List<Integer> applyGsubFeature(ScriptFeature scriptFeature, List<Integer> originalGlyphs)
    {
        Set<List<Integer>> allGlyphIdsForSubstitution = scriptFeature.getAllGlyphIdsForSubstitution();
        if (allGlyphIdsForSubstitution.isEmpty())
        {
            LOG.debug("getAllGlyphIdsForSubstitution() for {} is empty", scriptFeature.getName());
            return originalGlyphs;
        }
        GlyphArraySplitter glyphArraySplitter = new GlyphArraySplitterRegexImpl(
                allGlyphIdsForSubstitution);
        List<List<Integer>> tokens = glyphArraySplitter.split(originalGlyphs);
        List<Integer> gsubProcessedGlyphs = new ArrayList<>(tokens.size());
        for (List<Integer> chunk : tokens)
        {
            if (scriptFeature.canReplaceGlyphs(chunk))
            {
                gsubProcessedGlyphs.add(scriptFeature.getReplacementForGlyphs(chunk));
            }
            else
            {
                gsubProcessedGlyphs.addAll(chunk);
            }
        }
        LOG.debug("originalGlyphs: {}, gsubProcessedGlyphs: {}", originalGlyphs, gsubProcessedGlyphs);
        return gsubProcessedGlyphs;
    }

    /**
     * Maps characters to glyph ids through the cmap lookup.
     *
     * @param characters the characters to look up
     * @return an unmodifiable list with one glyph id per character, in the given order
     */
    private List<Integer> getGlyphIds(char... characters)
    {
        List<Integer> glyphIds = new ArrayList<>(characters.length);
        for (char character : characters)
        {
            glyphIds.add(cmapLookup.getGlyphId(character));
        }
        return Collections.unmodifiableList(glyphIds);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
Expand All @@ -45,6 +48,7 @@
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
Expand Down Expand Up @@ -245,6 +249,52 @@ void testBengali() throws IOException
//assertEquals(expectedExtractedtext, extracted.replaceAll("\r", "").trim());
}

/**
 * Writes a line of Devanagari text with the Lohit-Devanagari font into a one-page PDF,
 * compares its rendering with the reference in the test resources and dumps the extracted
 * text to a file in the output directory.
 *
 * @throws IOException if the PDF or the text dump cannot be written or read
 */
@Test
void testDevanagari() throws IOException
{
    String devanagariText = "प्रदेश ग्रामीण व्यवसायिक, लक्ष्मिपति, लक्षित, मक्खि उपलब्धि, प्रसिद्धि";

    // only referenced by the disabled assertion at the end of this method
    String expectedExtractedtext = devanagariText;
    File pdfFile = new File(OUT_DIR, "Devanagari.pdf");

    try (PDDocument doc = new PDDocument())
    {
        PDPage firstPage = new PDPage(PDRectangle.A4);
        doc.addPage(firstPage);
        PDFont devanagariFont = PDType0Font.load(doc,
                this.getClass().getResourceAsStream("/org/apache/pdfbox/ttf/Lohit-Devanagari.ttf"));

        try (PDPageContentStream cs = new PDPageContentStream(doc, firstPage))
        {
            cs.beginText();
            cs.setFont(devanagariFont, 20);
            cs.newLineAtOffset(50, 700);
            cs.showText(devanagariText);
            cs.endText();
        }

        doc.save(pdfFile);
    }

    File referenceDir = new File("src/test/resources/org/apache/pdfbox/ttf");

    // compare rendering; a mismatch with the reference fails the test
    // NOTE(review): the previous comment here said not to fail because rendering differs
    // between systems, but the code does fail - confirm which behaviour is wanted
    boolean renderingMatches = TestPDFToImage.doTestFile(pdfFile,
            referenceDir.getAbsolutePath(), OUT_DIR.getAbsolutePath());
    if (!renderingMatches)
    {
        fail("Rendering of " + pdfFile + " failed or is not identical to expected rendering in " + referenceDir + " directory");
    }

    // Check text extraction: the result is only written out, it is not asserted yet
    String extractedText = getUnicodeText(pdfFile);

    try (OutputStream textOut = new FileOutputStream(new File(OUT_DIR, "Devanagari.txt")))
    {
        textOut.write(extractedText.getBytes(StandardCharsets.UTF_8));
        //assertEquals(expectedExtractedtext, extractedText.replaceAll("\r", "").trim());
    }
}

/**
* Test corner case of PDFBOX-4302.
*
Expand Down

0 comments on commit 58f763e

Please sign in to comment.