PDFBOX-5729: added GSUB worker for Devanagari and test text, by JAVAU…

…SER; add language to factory git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1914485 13f79535-47bb-0310-9956-ffa450edef68
apache · Dec 9, 2023 · 58f763e · 58f763e
1 parent c5a85a7
commit 58f763e
Show file tree

Hide file tree

Showing 3 changed files with 327 additions and 2 deletions.
diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerFactory.java b/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerFactory.java
@@ -21,6 +21,9 @@
 import org.apache.fontbox.ttf.model.GsubData;
 import org.apache.fontbox.ttf.model.Language;
 
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
 /**
  * Gets a {@link Language} specific instance of a {@link GsubWorker}
  * 
@@ -29,19 +32,25 @@
  */
 public class GsubWorkerFactory
 {
+    private static final Logger LOG = LogManager.getLogger(GsubWorkerFactory.class);
 
     public GsubWorker getGsubWorker(CmapLookup cmapLookup, GsubData gsubData)
     {
+        //TODO this needs to be redesigned / improved because if a font supports several languages,
+        // it will choose one of them and maybe not the one expected.
+        LOG.debug("Language: {}", gsubData.getLanguage());
         switch (gsubData.getLanguage())
         {
         case BENGALI:
             return new GsubWorkerForBengali(cmapLookup, gsubData);
+        case DEVANAGARI:
+            return new GsubWorkerForDevanagari(cmapLookup, gsubData);
+        //case GUJARATI:
+        //    return new GsubWorkerForGujarati(cmapLookup, gsubData);
         case LATIN:
             return new GsubWorkerForLatin(cmapLookup, gsubData);
         default:
             return new DefaultGsubWorker();
         }
-
     }
-
 }
diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerForDevanagari.java b/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerForDevanagari.java
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fontbox.ttf.gsub;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.CmapLookup;
+import org.apache.fontbox.ttf.model.GsubData;
+import org.apache.fontbox.ttf.model.ScriptFeature;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+/**
+ * 
+ * Devanagari-specific implementation of GSUB system
+ * 
+ * @author JAVAUSER
+ *
+ */
+public class GsubWorkerForDevanagari implements GsubWorker
+{
+    private static final Logger LOG = LogManager.getLogger(GsubWorkerForDevanagari.class);
+
+    private static final String RKRF_FEATURE = "rkrf";
+    private static final String VATU_FEATURE = "vatu";
+
+    /**
+     * This sequence is very important. This has been taken from <a href=
+     * "https://docs.microsoft.com/en-us/typography/script-development/devanagari">https://docs.microsoft.com/en-us/typography/script-development/devanagari</a>
+     */
+    private static final List<String> FEATURES_IN_ORDER = Arrays.asList("locl", "nukt", "akhn",
+            "rphf", RKRF_FEATURE,"blwf", "half", VATU_FEATURE, "cjct", "pres", "abvs", "blws",
+            "psts", "haln", "calt");
+
+    // Reph glyphs
+    private static final char[] REPH_CHARS = {'\u0930', '\u094D'};
+    // Glyphs to precede reph
+    private static final char[] BEFORE_REPH_CHARS={'\u093E','\u0940'};
+
+    // Devanagari vowel sign I
+    private static final char BEFORE_HALF_CHAR = '\u093F';
+
+    private final CmapLookup cmapLookup;
+    private final GsubData gsubData;
+
+    private final List<Integer> rephGlyphIds;
+    private final List<Integer> beforeRephGlyphIds;
+    private final List<Integer> beforeHalfGlyphIds;
+
+    GsubWorkerForDevanagari(CmapLookup cmapLookup, GsubData gsubData)
+    {
+        this.cmapLookup = cmapLookup;
+        this.gsubData = gsubData;
+        beforeHalfGlyphIds = getBeforeHalfGlyphIds();
+        rephGlyphIds = getRephGlyphIds();
+        beforeRephGlyphIds = getbeforeRephGlyphIds();
+    }
+
+    @Override
+    public List<Integer> applyTransforms(List<Integer> originalGlyphIds)
+    {
+        List<Integer> intermediateGlyphsFromGsub = adjustRephPosition(originalGlyphIds);
+        intermediateGlyphsFromGsub = repositionGlyphs(intermediateGlyphsFromGsub);
+        for (String feature : FEATURES_IN_ORDER)
+        {
+            if (!gsubData.isFeatureSupported(feature))
+            {
+                if (feature.equals(RKRF_FEATURE) && gsubData.isFeatureSupported(VATU_FEATURE))
+                {
+                    // Create your own rkrf feature from vatu feature
+                    intermediateGlyphsFromGsub = applyRKRFFeature(
+                            gsubData.getFeature(VATU_FEATURE),
+                            intermediateGlyphsFromGsub);
+                }
+                LOG.debug("the feature {} was not found", feature);
+                continue;
+            }
+
+            LOG.debug("applying the feature {}", feature);
+            ScriptFeature scriptFeature = gsubData.getFeature(feature);
+            intermediateGlyphsFromGsub = applyGsubFeature(scriptFeature,
+                    intermediateGlyphsFromGsub);
+        }
+        return Collections.unmodifiableList(intermediateGlyphsFromGsub);
+    }
+
+    private List<Integer> applyRKRFFeature(ScriptFeature rkrfGlyphsForSubstitution,
+            List<Integer> originalGlyphIds)
+    {
+        Set<List<Integer>> rkrfGlyphIds = rkrfGlyphsForSubstitution.getAllGlyphIdsForSubstitution();
+        if (rkrfGlyphIds.isEmpty())
+        {
+            LOG.debug("Glyph substitution list for {} is empty.", rkrfGlyphsForSubstitution.getName());
+            return originalGlyphIds;
+        }
+        // Replace this with better implementation to get second GlyphId from rkrfGlyphIds
+        int rkrfReplacement = 0;
+        for (List<Integer> firstList : rkrfGlyphIds)
+        {
+            if (firstList.size() > 1)
+            {
+                rkrfReplacement = firstList.get(1);
+                break;
+            }
+        }
+
+        if (rkrfReplacement == 0)
+        {
+            LOG.debug("Cannot find rkrf candidate. The rkrfGlyphIds doesn't contain lists of two elements.");
+            return originalGlyphIds;
+        }
+
+        List<Integer> rkrfList = new ArrayList<>(originalGlyphIds);
+        for (int index = originalGlyphIds.size() - 1; index > 1; index--)
+        {
+            int raGlyph = originalGlyphIds.get(index);
+            if (raGlyph == rephGlyphIds.get(0))
+            {
+                int viramaGlyph = originalGlyphIds.get(index - 1);
+                if (viramaGlyph == rephGlyphIds.get(1))
+                {
+                    rkrfList.set(index - 1, rkrfReplacement);
+                    rkrfList.remove(index);
+                }
+            }
+        }
+        return rkrfList;
+    }
+
+    private List<Integer> adjustRephPosition(List<Integer> originalGlyphIds)
+    {
+        List<Integer> rephAdjustedList = new ArrayList<>(originalGlyphIds);
+        for (int index = 0; index < originalGlyphIds.size() - 2; index++)
+        {
+            int raGlyph = originalGlyphIds.get(index);
+            int viramaGlyph = originalGlyphIds.get(index + 1);
+            if (raGlyph == rephGlyphIds.get(0) && viramaGlyph == rephGlyphIds.get(1))
+            {
+                int nextConsonantGlyph = originalGlyphIds.get(index + 2);
+                rephAdjustedList.set(index, nextConsonantGlyph);
+                rephAdjustedList.set(index + 1, raGlyph);
+                rephAdjustedList.set(index + 2, viramaGlyph);
+
+                if (index + 3 < originalGlyphIds.size())
+                {
+                    int matraGlyph = originalGlyphIds.get(index + 3);
+                    if (beforeRephGlyphIds.contains(matraGlyph))
+                    {
+                        rephAdjustedList.set(index + 1, matraGlyph);
+                        rephAdjustedList.set(index + 2, raGlyph);
+                        rephAdjustedList.set(index + 3, viramaGlyph);
+                    }
+                }
+            }
+        }
+        return rephAdjustedList;
+    }
+
+    private List<Integer> repositionGlyphs(List<Integer> originalGlyphIds)
+    {
+        List<Integer> repositionedGlyphIds = new ArrayList<>(originalGlyphIds);
+        int listSize = repositionedGlyphIds.size();
+        int foundIndex = listSize - 1;
+        int nextIndex = listSize - 2;
+        while (nextIndex > -1)
+        {
+            int glyph = repositionedGlyphIds.get(foundIndex);
+            int prevIndex = foundIndex + 1;
+            if (beforeHalfGlyphIds.contains(glyph))
+            {
+                repositionedGlyphIds.remove(foundIndex);
+                repositionedGlyphIds.add(nextIndex--, glyph);
+            }
+            else if (rephGlyphIds.get(1).equals(glyph) && prevIndex < listSize)
+            {
+                int prevGlyph = repositionedGlyphIds.get(prevIndex);
+                if (beforeHalfGlyphIds.contains(prevGlyph))
+                {
+                    repositionedGlyphIds.remove(prevIndex);
+                    repositionedGlyphIds.add(nextIndex--, prevGlyph);
+                }
+            }
+            foundIndex = nextIndex--;
+        }
+        return repositionedGlyphIds;
+    }
+
+    private List<Integer> applyGsubFeature(ScriptFeature scriptFeature, List<Integer> originalGlyphs)
+    {
+        Set<List<Integer>> allGlyphIdsForSubstitution = scriptFeature.getAllGlyphIdsForSubstitution();
+        if (allGlyphIdsForSubstitution.isEmpty())
+        {
+            LOG.debug("getAllGlyphIdsForSubstitution() for {} is empty", scriptFeature.getName());
+            return originalGlyphs;
+        }
+        GlyphArraySplitter glyphArraySplitter = new GlyphArraySplitterRegexImpl(
+                allGlyphIdsForSubstitution);
+        List<List<Integer>> tokens = glyphArraySplitter.split(originalGlyphs);
+        List<Integer> gsubProcessedGlyphs = new ArrayList<>(tokens.size());
+        tokens.forEach(chunk ->
+        {
+            if (scriptFeature.canReplaceGlyphs(chunk))
+            {
+                Integer glyphId = scriptFeature.getReplacementForGlyphs(chunk);
+                gsubProcessedGlyphs.add(glyphId);
+            }
+            else
+            {
+                gsubProcessedGlyphs.addAll(chunk);
+            }
+        });
+        LOG.debug("originalGlyphs: {}, gsubProcessedGlyphs: {}", originalGlyphs, gsubProcessedGlyphs);
+        return gsubProcessedGlyphs;
+    }
+
+    private List<Integer> getBeforeHalfGlyphIds()
+    {
+        List<Integer> glyphIds = new ArrayList<>();
+        glyphIds.add(getGlyphId(BEFORE_HALF_CHAR));
+        return Collections.unmodifiableList(glyphIds);
+    }
+
+    private List<Integer> getRephGlyphIds()
+    {
+        List<Integer> result = new ArrayList<>();
+        for (char character : REPH_CHARS)
+        {
+            result.add(getGlyphId(character));
+        }
+        return Collections.unmodifiableList(result);
+    }
+
+    private List<Integer> getbeforeRephGlyphIds()
+    {
+        List<Integer> glyphIds = new ArrayList<>();
+        for (char character : BEFORE_REPH_CHARS)
+        {
+            glyphIds.add(getGlyphId(character));
+        }
+        return Collections.unmodifiableList(glyphIds);
+    }
+
+    private Integer getGlyphId(char character)
+    {
+        return cmapLookup.getGlyphId(character);
+    }
+}
diff --git a/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java b/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
@@ -19,8 +19,11 @@
 
 import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
@@ -45,6 +48,7 @@
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.parallel.Execution;
@@ -245,6 +249,52 @@ void testBengali() throws IOException
         //assertEquals(expectedExtractedtext, extracted.replaceAll("\r", "").trim());
     }
 
+    @Test
+    void testDevanagari() throws IOException
+    {
+        String DEVANAGARI_TEXT = "प्रदेश ग्रामीण व्यवसायिक, लक्ष्मिपति, लक्षित, मक्खि उपलब्धि, प्रसिद्धि";
+
+        String expectedExtractedtext = DEVANAGARI_TEXT;
+        File pdf = new File(OUT_DIR, "Devanagari.pdf");
+
+        try (PDDocument document = new PDDocument())
+        {
+            PDPage page = new PDPage(PDRectangle.A4);
+            document.addPage(page);
+            PDFont font = PDType0Font.load(document, 
+                    this.getClass().getResourceAsStream("/org/apache/pdfbox/ttf/Lohit-Devanagari.ttf"));
+
+            try (PDPageContentStream contentStream = new PDPageContentStream(document, page))
+            {
+                contentStream.beginText();
+                contentStream.setFont(font, 20);
+                contentStream.newLineAtOffset(50, 700);
+                contentStream.showText(DEVANAGARI_TEXT);
+                contentStream.endText();
+            }
+
+            document.save(pdf);
+        }
+
+        File IN_DIR = new File("src/test/resources/org/apache/pdfbox/ttf");
+
+        // compare rendering
+        if (!TestPDFToImage.doTestFile(pdf, IN_DIR.getAbsolutePath(), OUT_DIR.getAbsolutePath()))
+        {
+            // don't fail, rendering is different on different systems, result must be viewed manually
+            fail("Rendering of " + pdf + " failed or is not identical to expected rendering in " + IN_DIR + " directory");
+        }
+
+        // Check text extraction
+        String extracted = getUnicodeText(pdf);
+
+        try (OutputStream os = new FileOutputStream(new File(OUT_DIR, "Devanagari.txt")))
+        {
+            os.write(extracted.getBytes(StandardCharsets.UTF_8));
+            //assertEquals(expectedExtractedtext, extracted.replaceAll("\r", "").trim());
+        }
+    }
+
     /**
      * Test corner case of PDFBOX-4302.
      *