diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerFactory.java b/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerFactory.java
index afcab567af6..e384314d41f 100644
--- a/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerFactory.java
+++ b/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerFactory.java
@@ -21,6 +21,9 @@
 import org.apache.fontbox.ttf.model.GsubData;
 import org.apache.fontbox.ttf.model.Language;
 
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
 /**
  * Gets a {@link Language} specific instance of a {@link GsubWorker}
  *
@@ -29,19 +32,25 @@
  */
 public class GsubWorkerFactory
 {
+    private static final Logger LOG = LogManager.getLogger(GsubWorkerFactory.class);
 
     public GsubWorker getGsubWorker(CmapLookup cmapLookup, GsubData gsubData)
     {
+        //TODO this needs to be redesigned / improved because if a font supports several languages,
+        // it will choose one of them and maybe not the one expected.
+        LOG.debug("Language: {}", gsubData.getLanguage());
         switch (gsubData.getLanguage())
         {
         case BENGALI:
             return new GsubWorkerForBengali(cmapLookup, gsubData);
+        case DEVANAGARI:
+            return new GsubWorkerForDevanagari(cmapLookup, gsubData);
+        //case GUJARATI:
+        //    return new GsubWorkerForGujarati(cmapLookup, gsubData);
         case LATIN:
             return new GsubWorkerForLatin(cmapLookup, gsubData);
         default:
             return new DefaultGsubWorker();
         }
-
     }
-
 }
diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerForDevanagari.java b/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerForDevanagari.java
new file mode 100644
index 00000000000..3e35988873f
--- /dev/null
+++ b/fontbox/src/main/java/org/apache/fontbox/ttf/gsub/GsubWorkerForDevanagari.java
@@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fontbox.ttf.gsub;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.CmapLookup;
+import org.apache.fontbox.ttf.model.GsubData;
+import org.apache.fontbox.ttf.model.ScriptFeature;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+/**
+ *
+ * Devanagari-specific implementation of GSUB system
+ *
+ * @author JAVAUSER
+ *
+ */
+public class GsubWorkerForDevanagari implements GsubWorker
+{
+    private static final Logger LOG = LogManager.getLogger(GsubWorkerForDevanagari.class);
+
+    private static final String RKRF_FEATURE = "rkrf";
+    private static final String VATU_FEATURE = "vatu";
+
+    /**
+     * This sequence is very important. This has been taken from <a href=
+     * "https://docs.microsoft.com/en-us/typography/script-development/devanagari">https://docs.microsoft.com/en-us/typography/script-development/devanagari</a>
+     */
+    private static final List<String> FEATURES_IN_ORDER = Arrays.asList("locl", "nukt", "akhn",
+            "rphf", RKRF_FEATURE, "blwf", "half", VATU_FEATURE, "cjct", "pres", "abvs", "blws",
+            "psts", "haln", "calt");
+
+    // Reph glyphs
+    private static final char[] REPH_CHARS = {'\u0930', '\u094D'};
+    // Glyphs to precede reph
+    private static final char[] BEFORE_REPH_CHARS = {'\u093E', '\u0940'};
+
+    // Devanagari vowel sign I
+    private static final char BEFORE_HALF_CHAR = '\u093F';
+
+    private final CmapLookup cmapLookup;
+    private final GsubData gsubData;
+
+    private final List<Integer> rephGlyphIds;
+    private final List<Integer> beforeRephGlyphIds;
+    private final List<Integer> beforeHalfGlyphIds;
+
+    GsubWorkerForDevanagari(CmapLookup cmapLookup, GsubData gsubData)
+    {
+        this.cmapLookup = cmapLookup;
+        this.gsubData = gsubData;
+        beforeHalfGlyphIds = getBeforeHalfGlyphIds();
+        rephGlyphIds = getRephGlyphIds();
+        beforeRephGlyphIds = getBeforeRephGlyphIds();
+    }
+
+    /**
+     * Repositions the reph and vowel sign I glyphs, then applies every supported feature of
+     * {@code FEATURES_IN_ORDER} in that order. If the font lacks the "rkrf" feature but supports
+     * "vatu", an rkrf substitution is derived from the vatu feature instead.
+     *
+     * @param originalGlyphIds the glyph ids to transform
+     * @return an unmodifiable list of the resulting glyph ids
+     */
+    @Override
+    public List<Integer> applyTransforms(List<Integer> originalGlyphIds)
+    {
+        List<Integer> intermediateGlyphsFromGsub = adjustRephPosition(originalGlyphIds);
+        intermediateGlyphsFromGsub = repositionGlyphs(intermediateGlyphsFromGsub);
+        for (String feature : FEATURES_IN_ORDER)
+        {
+            if (!gsubData.isFeatureSupported(feature))
+            {
+                if (feature.equals(RKRF_FEATURE) && gsubData.isFeatureSupported(VATU_FEATURE))
+                {
+                    // Create your own rkrf feature from vatu feature
+                    intermediateGlyphsFromGsub = applyRKRFFeature(
+                            gsubData.getFeature(VATU_FEATURE),
+                            intermediateGlyphsFromGsub);
+                }
+                LOG.debug("the feature {} was not found", feature);
+                continue;
+            }
+
+            LOG.debug("applying the feature {}", feature);
+            ScriptFeature scriptFeature = gsubData.getFeature(feature);
+            intermediateGlyphsFromGsub = applyGsubFeature(scriptFeature,
+                    intermediateGlyphsFromGsub);
+        }
+        return Collections.unmodifiableList(intermediateGlyphsFromGsub);
+    }
+
+    /**
+     * Emulates the rkrf feature with the substitutions of another feature (vatu): wherever a
+     * virama glyph is directly followed by a ra glyph, the virama is replaced by a substitute
+     * glyph and the ra is removed. The substitute is the second glyph id of the first
+     * substitution entry with more than one element; a virama at position 0 is never replaced.
+     *
+     * @param rkrfGlyphsForSubstitution the feature whose substitution entries yield the substitute
+     * @param originalGlyphIds the glyph ids to process
+     * @return a new list with the substitutions applied, or {@code originalGlyphIds} itself if
+     * no substitute could be determined
+     */
+    private List<Integer> applyRKRFFeature(ScriptFeature rkrfGlyphsForSubstitution,
+            List<Integer> originalGlyphIds)
+    {
+        Set<List<Integer>> rkrfGlyphIds = rkrfGlyphsForSubstitution.getAllGlyphIdsForSubstitution();
+        if (rkrfGlyphIds.isEmpty())
+        {
+            LOG.debug("Glyph substitution list for {} is empty.", rkrfGlyphsForSubstitution.getName());
+            return originalGlyphIds;
+        }
+        // Replace this with better implementation to get second GlyphId from rkrfGlyphIds
+        int rkrfReplacement = 0;
+        for (List<Integer> firstList : rkrfGlyphIds)
+        {
+            if (firstList.size() > 1)
+            {
+                rkrfReplacement = firstList.get(1);
+                break;
+            }
+        }
+
+        if (rkrfReplacement == 0)
+        {
+            LOG.debug("Cannot find rkrf candidate. The rkrfGlyphIds doesn't contain lists of two elements.");
+            return originalGlyphIds;
+        }
+
+        List<Integer> rkrfList = new ArrayList<>(originalGlyphIds);
+        for (int index = originalGlyphIds.size() - 1; index > 1; index--)
+        {
+            int raGlyph = originalGlyphIds.get(index);
+            if (raGlyph == rephGlyphIds.get(0))
+            {
+                int viramaGlyph = originalGlyphIds.get(index - 1);
+                if (viramaGlyph == rephGlyphIds.get(1))
+                {
+                    rkrfList.set(index - 1, rkrfReplacement);
+                    rkrfList.remove(index);
+                }
+            }
+        }
+        return rkrfList;
+    }
+
+    /**
+     * Moves each reph (ra followed by virama) behind the glyph that follows it, and additionally
+     * behind a subsequent AA or II vowel sign (U+093E, U+0940) if there is one.
+     *
+     * @param originalGlyphIds the glyph ids to process
+     * @return a new list with the repositioned reph glyphs
+     */
+    private List<Integer> adjustRephPosition(List<Integer> originalGlyphIds)
+    {
+        List<Integer> rephAdjustedList = new ArrayList<>(originalGlyphIds);
+        for (int index = 0; index < originalGlyphIds.size() - 2; index++)
+        {
+            int raGlyph = originalGlyphIds.get(index);
+            int viramaGlyph = originalGlyphIds.get(index + 1);
+            if (raGlyph == rephGlyphIds.get(0) && viramaGlyph == rephGlyphIds.get(1))
+            {
+                int nextConsonantGlyph = originalGlyphIds.get(index + 2);
+                rephAdjustedList.set(index, nextConsonantGlyph);
+                rephAdjustedList.set(index + 1, raGlyph);
+                rephAdjustedList.set(index + 2, viramaGlyph);
+
+                if (index + 3 < originalGlyphIds.size())
+                {
+                    int matraGlyph = originalGlyphIds.get(index + 3);
+                    if (beforeRephGlyphIds.contains(matraGlyph))
+                    {
+                        rephAdjustedList.set(index + 1, matraGlyph);
+                        rephAdjustedList.set(index + 2, raGlyph);
+                        rephAdjustedList.set(index + 3, viramaGlyph);
+                    }
+                }
+            }
+        }
+        return rephAdjustedList;
+    }
+
+    /**
+     * Moves the vowel sign I (U+093F) towards the start of the list, scanning from the end: it
+     * is placed one position earlier or, when it directly follows a virama, in front of the
+     * glyph that precedes that virama.
+     *
+     * @param originalGlyphIds the glyph ids to process
+     * @return a new list with the repositioned glyphs
+     */
+    private List<Integer> repositionGlyphs(List<Integer> originalGlyphIds)
+    {
+        List<Integer> repositionedGlyphIds = new ArrayList<>(originalGlyphIds);
+        int listSize = repositionedGlyphIds.size();
+        int foundIndex = listSize - 1;
+        int nextIndex = listSize - 2;
+        while (nextIndex > -1)
+        {
+            int glyph = repositionedGlyphIds.get(foundIndex);
+            int prevIndex = foundIndex + 1;
+            if (beforeHalfGlyphIds.contains(glyph))
+            {
+                repositionedGlyphIds.remove(foundIndex);
+                repositionedGlyphIds.add(nextIndex--, glyph);
+            }
+            else if (rephGlyphIds.get(1).equals(glyph) && prevIndex < listSize)
+            {
+                int prevGlyph = repositionedGlyphIds.get(prevIndex);
+                if (beforeHalfGlyphIds.contains(prevGlyph))
+                {
+                    repositionedGlyphIds.remove(prevIndex);
+                    repositionedGlyphIds.add(nextIndex--, prevGlyph);
+                }
+            }
+            foundIndex = nextIndex--;
+        }
+        return repositionedGlyphIds;
+    }
+
+    /**
+     * Applies a single GSUB feature: the glyph ids are split into chunks and every chunk the
+     * feature can replace is substituted by its replacement glyph; other chunks are kept as is.
+     *
+     * @param scriptFeature the feature to apply
+     * @param originalGlyphs the glyph ids to process
+     * @return the processed glyph ids, or {@code originalGlyphs} itself if the feature has no
+     * substitutions
+     */
+    private List<Integer> applyGsubFeature(ScriptFeature scriptFeature, List<Integer> originalGlyphs)
+    {
+        Set<List<Integer>> allGlyphIdsForSubstitution = scriptFeature.getAllGlyphIdsForSubstitution();
+        if (allGlyphIdsForSubstitution.isEmpty())
+        {
+            LOG.debug("getAllGlyphIdsForSubstitution() for {} is empty", scriptFeature.getName());
+            return originalGlyphs;
+        }
+        GlyphArraySplitter glyphArraySplitter = new GlyphArraySplitterRegexImpl(
+                allGlyphIdsForSubstitution);
+        List<List<Integer>> tokens = glyphArraySplitter.split(originalGlyphs);
+        List<Integer> gsubProcessedGlyphs = new ArrayList<>(tokens.size());
+        tokens.forEach(chunk ->
+        {
+            if (scriptFeature.canReplaceGlyphs(chunk))
+            {
+                Integer glyphId = scriptFeature.getReplacementForGlyphs(chunk);
+                gsubProcessedGlyphs.add(glyphId);
+            }
+            else
+            {
+                gsubProcessedGlyphs.addAll(chunk);
+            }
+        });
+        LOG.debug("originalGlyphs: {}, gsubProcessedGlyphs: {}", originalGlyphs, gsubProcessedGlyphs);
+        return gsubProcessedGlyphs;
+    }
+
+    /** Returns the unmodifiable glyph id list for {@code BEFORE_HALF_CHAR}. */
+    private List<Integer> getBeforeHalfGlyphIds()
+    {
+        List<Integer> glyphIds = new ArrayList<>();
+        glyphIds.add(getGlyphId(BEFORE_HALF_CHAR));
+        return Collections.unmodifiableList(glyphIds);
+    }
+
+    /** Returns the unmodifiable glyph id list for {@code REPH_CHARS}: ra first, then virama. */
+    private List<Integer> getRephGlyphIds()
+    {
+        List<Integer> result = new ArrayList<>();
+        for (char character : REPH_CHARS)
+        {
+            result.add(getGlyphId(character));
+        }
+        return Collections.unmodifiableList(result);
+    }
+
+    /** Returns the unmodifiable glyph id list for {@code BEFORE_REPH_CHARS}. */
+    private List<Integer> getBeforeRephGlyphIds()
+    {
+        List<Integer> glyphIds = new ArrayList<>();
+        for (char character : BEFORE_REPH_CHARS)
+        {
+            glyphIds.add(getGlyphId(character));
+        }
+        return Collections.unmodifiableList(glyphIds);
+    }
+
+    private Integer getGlyphId(char character)
+    {
+        return cmapLookup.getGlyphId(character);
+    }
+}
diff --git a/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java b/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
index a6dd621e50f..74326e47fc1 100644
--- a/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
+++ b/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
@@ -19,8 +19,11 @@
 
 import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
@@ -45,6 +48,7 @@
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.parallel.Execution;
@@ -245,6 +249,52 @@ void testBengali() throws IOException
         //assertEquals(expectedExtractedtext, extracted.replaceAll("\r", "").trim());
     }
 
+    @Test
+    void testDevanagari() throws IOException
+    {
+        String DEVANAGARI_TEXT = "प्रदेश ग्रामीण व्यवसायिक, लक्ष्मिपति, लक्षित, मक्खि उपलब्धि, प्रसिद्धि";
+
+        String expectedExtractedtext = DEVANAGARI_TEXT;
+        File pdf = new File(OUT_DIR, "Devanagari.pdf");
+
+        try (PDDocument document = new PDDocument())
+        {
+            PDPage page = new PDPage(PDRectangle.A4);
+            document.addPage(page);
+            PDFont font = PDType0Font.load(document,
+                    this.getClass().getResourceAsStream("/org/apache/pdfbox/ttf/Lohit-Devanagari.ttf"));
+
+            try (PDPageContentStream contentStream = new PDPageContentStream(document, page))
+            {
+                contentStream.beginText();
+                contentStream.setFont(font, 20);
+                contentStream.newLineAtOffset(50, 700);
+                contentStream.showText(DEVANAGARI_TEXT);
+                contentStream.endText();
+            }
+
+            document.save(pdf);
+        }
+
+        File IN_DIR = new File("src/test/resources/org/apache/pdfbox/ttf");
+
+        // compare rendering
+        if (!TestPDFToImage.doTestFile(pdf, IN_DIR.getAbsolutePath(), OUT_DIR.getAbsolutePath()))
+        {
+            // rendering can differ between systems; on failure compare the output with the reference manually
+            fail("Rendering of " + pdf + " failed or is not identical to expected rendering in " + IN_DIR + " directory");
+        }
+
+        // Check text extraction
+        String extracted = getUnicodeText(pdf);
+
+        try (OutputStream os = new FileOutputStream(new File(OUT_DIR, "Devanagari.txt")))
+        {
+            os.write(extracted.getBytes(StandardCharsets.UTF_8));
+            //assertEquals(expectedExtractedtext, extracted.replaceAll("\r", "").trim());
+        }
+    }
+
     /**
      * Test corner case of PDFBOX-4302.
      *