From 5ec36b8be6574443f512904953e77b8854e9f181 Mon Sep 17 00:00:00 2001 From: Mykola Bohdiuk Date: Wed, 10 Apr 2024 02:36:31 +0300 Subject: [PATCH] [FIX] Memory leak: RandomAccessRead passed to TrueTypeCollection constructor [PERFORMANCE] Improve FileSystemFontProvider.scanFonts() performance by adding 'only headers' mode to TTF parser: * only read tables needed for FSFontInfo ('name', 'head', 'OS/2', 'CFF ', 'gcid') * 'CFF ' and 'head' table parsers finish as soon as it has all needed data (in 'only headers' mode) * streamline I/O: replace readByte() with read(array), avoid allocating byte[] where possible * NamingTable: use sorted list instead of multilevel HashMap, delay-load Strings * skip checksumming as it is now faster to simply re-parse (gated with "pdfbox.fontcache.skipchecksums" for backward compatibility) [DEV] Breaking change: NameRecord.getString() is now package-private and lazy, renamed to getStringLazy(). [DEV] Breaking change: new abstract method TTFDataStream.getSubReader() --- .../java/org/apache/fontbox/cff/CFFFont.java | 40 ++-- .../org/apache/fontbox/cff/CFFParser.java | 39 ++-- .../cff/DataInputRandomAccessRead.java | 11 +- .../java/org/apache/fontbox/ttf/CFFTable.java | 21 +- .../org/apache/fontbox/ttf/HeaderTable.java | 9 + .../apache/fontbox/ttf/LoadOnlyHeaders.java | 155 ++++++++++++++ .../org/apache/fontbox/ttf/NameRecord.java | 6 +- .../org/apache/fontbox/ttf/NamingTable.java | 191 ++++++++++++------ .../ttf/RandomAccessReadDataStream.java | 26 ++- .../RandomAccessReadUncachedDataStream.java | 191 ++++++++++++++++++ .../org/apache/fontbox/ttf/TTCDataStream.java | 6 + .../org/apache/fontbox/ttf/TTFDataStream.java | 13 ++ .../org/apache/fontbox/ttf/TTFParser.java | 112 +++++++++- .../org/apache/fontbox/ttf/TTFSubsetter.java | 12 +- .../fontbox/ttf/TrueTypeCollection.java | 57 +++++- .../org/apache/fontbox/ttf/TrueTypeFont.java | 36 ++++ .../apache/pdfbox/io/RandomAccessRead.java | 81 +++++++- .../apache/pdfbox/pdfparser/BaseParser.java | 5 
+- .../apache/pdfbox/pdfparser/COSParser.java | 16 +- .../pdfbox/pdfparser/PDFStreamParser.java | 2 +- .../pdfbox/pdfparser/PDFXrefStreamParser.java | 12 +- .../pdmodel/font/FileSystemFontProvider.java | 160 +++++++-------- .../pdfbox/pdmodel/font/PDCIDFontType2.java | 9 +- .../pdfbox/pdmodel/font/PDFontFactory.java | 10 +- .../pdfbox/pdmodel/font/PDTrueTypeFont.java | 9 +- 25 files changed, 957 insertions(+), 272 deletions(-) create mode 100644 fontbox/src/main/java/org/apache/fontbox/ttf/LoadOnlyHeaders.java create mode 100644 fontbox/src/main/java/org/apache/fontbox/ttf/RandomAccessReadUncachedDataStream.java diff --git a/fontbox/src/main/java/org/apache/fontbox/cff/CFFFont.java b/fontbox/src/main/java/org/apache/fontbox/cff/CFFFont.java index 2e3818dbecd..5c6ad3ccfde 100644 --- a/fontbox/src/main/java/org/apache/fontbox/cff/CFFFont.java +++ b/fontbox/src/main/java/org/apache/fontbox/cff/CFFFont.java @@ -35,7 +35,7 @@ public abstract class CFFFont implements FontBoxFont { private String fontName; private CFFCharset charset; - private CFFParser.ByteSource source; +// private CFFParser.ByteSource source; protected final Map topDict = new LinkedHashMap<>(); protected byte[][] charStrings; protected byte[][] globalSubrIndex; @@ -140,25 +140,25 @@ public final List getCharStringBytes() return Arrays.asList(charStrings); } - /** - * Sets a byte source to re-read the CFF data in the future. - */ - final void setData(CFFParser.ByteSource source) - { - this.source = source; - } - - /** - * Returns the CFF data. - * - * @return the cff data as byte array - * - * @throws IOException if the data could not be read - */ - public byte[] getData() throws IOException - { - return source.getBytes(); - } +// /** +// * Sets a byte source to re-read the CFF data in the future. +// */ +// final void setData(CFFParser.ByteSource source) +// { +// this.source = source; +// } +// +// /** +// * Returns the CFF data. 
+// * +// * @return the cff data as byte array +// * +// * @throws IOException if the data could not be read +// */ +// public byte[] getData() throws IOException +// { +// return source.getBytes(); +// } /** * Returns the number of charstrings in the font. diff --git a/fontbox/src/main/java/org/apache/fontbox/cff/CFFParser.java b/fontbox/src/main/java/org/apache/fontbox/cff/CFFParser.java index 5a04ca2ba89..1f4705b88fa 100644 --- a/fontbox/src/main/java/org/apache/fontbox/cff/CFFParser.java +++ b/fontbox/src/main/java/org/apache/fontbox/cff/CFFParser.java @@ -28,6 +28,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.fontbox.ttf.LoadOnlyHeaders; import org.apache.pdfbox.io.RandomAccessRead; @@ -47,7 +48,8 @@ public class CFFParser private static final String TAG_TTFONLY = "\u0000\u0001\u0000\u0000"; private String[] stringIndex = null; - private ByteSource source; +// private ByteSource source; + private LoadOnlyHeaders loadOnlyHeaders; // for debugging only private String debugFontName; @@ -66,6 +68,11 @@ public interface ByteSource byte[] getBytes() throws IOException; } + public void setLoadOnlyHeaders(LoadOnlyHeaders loadOnlyHeaders) + { + this.loadOnlyHeaders = loadOnlyHeaders; + } + /** * Parse CFF font using byte array, also passing in a byte source for future use. * @@ -77,7 +84,7 @@ public interface ByteSource public List parse(byte[] bytes, ByteSource source) throws IOException { // TODO do we need to store the source data of the font? It isn't used at all - this.source = source; +// this.source = source; return parse(new DataInputByteArray(bytes)); } @@ -91,17 +98,10 @@ public List parse(byte[] bytes, ByteSource source) throws IOException public List parse(RandomAccessRead randomAccessRead) throws IOException { // TODO do we need to store the source data of the font? 
It isn't used at all - byte[] bytes = new byte[(int) randomAccessRead.length()]; - randomAccessRead.seek(0); - int remainingBytes = bytes.length; - int amountRead; - while ((amountRead = randomAccessRead.read(bytes, bytes.length - remainingBytes, - remainingBytes)) > 0) - { - remainingBytes -= amountRead; - } randomAccessRead.seek(0); - this.source = new CFFBytesource(bytes); +// byte[] bytes = randomAccessRead.readNBytes((int) randomAccessRead.length()); +// randomAccessRead.seek(0); +// this.source = new CFFBytesource(bytes); return parse(new DataInputRandomAccessRead(randomAccessRead)); } @@ -151,7 +151,7 @@ private List parse(DataInput input) throws IOException { CFFFont font = parseFont(input, nameIndex[i], topDictIndex[i]); font.setGlobalSubrIndex(globalSubrIndex); - font.setData(source); +// font.setData(source); fonts.add(font); } return fonts; @@ -492,6 +492,15 @@ private CFFFont parseFont(DataInput input, String name, byte[] topDictIndex) thr cffCIDFont.setSupplement(rosEntry.getNumber(2).intValue()); font = cffCIDFont; + if (loadOnlyHeaders != null) + { + loadOnlyHeaders.setOtfROS( + cffCIDFont.getRegistry(), + cffCIDFont.getOrdering(), + cffCIDFont.getSupplement()); + // we just read (Registry, Ordering, Supplement) and don't need anything else + return font; + } } else { @@ -501,6 +510,10 @@ private CFFFont parseFont(DataInput input, String name, byte[] topDictIndex) thr // name debugFontName = name; font.setName(name); + if (loadOnlyHeaders != null) + { + return font; // not a 'CFFCIDFont' => cannot read properties needed by LoadOnlyHeaders + } // top dict font.addValueToTopDict("version", getString(topDict, "version")); diff --git a/fontbox/src/main/java/org/apache/fontbox/cff/DataInputRandomAccessRead.java b/fontbox/src/main/java/org/apache/fontbox/cff/DataInputRandomAccessRead.java index 65460969aac..79becff9e52 100644 --- a/fontbox/src/main/java/org/apache/fontbox/cff/DataInputRandomAccessRead.java +++ 
b/fontbox/src/main/java/org/apache/fontbox/cff/DataInputRandomAccessRead.java @@ -170,16 +170,7 @@ public byte[] readBytes(int length) throws IOException { throw new IOException("length is negative"); } - if (randomAccessRead.length() - randomAccessRead.getPosition() < length) - { - throw new IOException("Premature end of buffer reached"); - } - byte[] bytes = new byte[length]; - for (int i = 0; i < length; i++) - { - bytes[i] = readByte(); - } - return bytes; + return randomAccessRead.readExact(length); } @Override diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/CFFTable.java b/fontbox/src/main/java/org/apache/fontbox/ttf/CFFTable.java index 7d6d14cae0c..9551d7e6e63 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/CFFTable.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/CFFTable.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.fontbox.cff.CFFFont; import org.apache.fontbox.cff.CFFParser; +import org.apache.pdfbox.io.RandomAccessRead; /** * PostScript font program (compact font format). 
@@ -48,10 +49,24 @@ public class CFFTable extends TTFTable @Override void read(TrueTypeFont ttf, TTFDataStream data) throws IOException { - byte[] bytes = data.read((int)getLength()); - CFFParser parser = new CFFParser(); - cffFont = parser.parse(bytes, new CFFBytesource(ttf)).get(0); + parser.setLoadOnlyHeaders(ttf.getLoadOnlyHeaders()); +// assert data.getCurrentPosition() == getOffset(); + try (RandomAccessRead subReader = data.getSubReader(getLength())) + { + if (subReader != null) + { + cffFont = parser.parse(subReader).get(0); + data.seek(getOffset() + getLength()); + } + else + { + assert ttf.getLoadOnlyHeaders() == null + : "It is inefficient to read whole CFF table to parse only headers, please use RandomAccessReadUncachedDataStream"; + byte[] bytes = data.read((int)getLength()); + cffFont = parser.parse(bytes, new CFFBytesource(ttf)).get(0); + } + } initialized = true; } diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/HeaderTable.java b/fontbox/src/main/java/org/apache/fontbox/ttf/HeaderTable.java index 4d19475db31..b75c164c30b 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/HeaderTable.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/HeaderTable.java @@ -74,6 +74,15 @@ public class HeaderTable extends TTFTable @Override void read(TrueTypeFont ttf, TTFDataStream data) throws IOException { + LoadOnlyHeaders outHeaders = ttf.getLoadOnlyHeaders(); + if (outHeaders != null) { + data.skip(44); + macStyle = data.readUnsignedShort(); + outHeaders.setHeaderMacStyle(macStyle); + initialized = true; + return; + } + version = data.read32Fixed(); fontRevision = data.read32Fixed(); checkSumAdjustment = data.readUnsignedInt(); diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/LoadOnlyHeaders.java b/fontbox/src/main/java/org/apache/fontbox/ttf/LoadOnlyHeaders.java new file mode 100644 index 00000000000..e298c7914ec --- /dev/null +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/LoadOnlyHeaders.java @@ -0,0 +1,155 @@ +/* + * Licensed 
to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.fontbox.ttf; + +import java.io.IOException; + +/** + * To improve performance of {@code FileSystemFontProvider.scanFonts(...)}, + * this class is used both as a marker to skip + * unused data and as a storage for collected data. + *

+ * Tables it needs:<ul>
+ * <li>NamingTable.TAG,</li>
+ * <li>HeaderTable.TAG,</li>
+ * <li>OS2WindowsMetricsTable.TAG,</li>
+ * <li>OTF: CFFTable.TAG</li>
+ * <li>non-OTF: "gcid"</li>
+ * </ul>
+ * + * @author Mykola Bohdiuk + */ +public final class LoadOnlyHeaders +{ + static final int BYTES_GCID = 142; + + private IOException exception; + private String name; + private Integer headerMacStyle; + private OS2WindowsMetricsTable os2Windows; + private String fontFamily; + private String fontSubFamily; + private byte[] nonOtfGcid142; + // + private boolean isOTFAndPostScript; + private String otfRegistry; + private String otfOrdering; + private int otfSupplement; + + public IOException getException() + { + return exception; + } + + public String getName() + { + return name; + } + + /** + * null == no HeaderTable, {@code ttf.getHeader().getMacStyle()} + */ + public Integer getHeaderMacStyle() + { + return headerMacStyle; + } + + public OS2WindowsMetricsTable getOS2Windows() + { + return os2Windows; + } + + // only when LOGGER(FileSystemFontProvider).isTraceEnabled() tracing: FontFamily, FontSubfamily + public String getFontFamily() + { + return fontFamily; + } + + public String getFontSubFamily() + { + return fontSubFamily; + } + + public boolean isOpenTypePostScript() + { + return isOTFAndPostScript; + } + + public byte[] getNonOtfTableGCID142() + { + return nonOtfGcid142; + } + + public String getOtfRegistry() + { + return otfRegistry; + } + + public String getOtfOrdering() + { + return otfOrdering; + } + + public int getOtfSupplement() + { + return otfSupplement; + } + + void setException(IOException exception) + { + this.exception = exception; + } + + void setName(String name) + { + this.name = name; + } + + void setHeaderMacStyle(Integer headerMacStyle) + { + this.headerMacStyle = headerMacStyle; + } + + void setOs2Windows(OS2WindowsMetricsTable os2Windows) + { + this.os2Windows = os2Windows; + } + + void setFontFamily(String fontFamily, String fontSubFamily) + { + this.fontFamily = fontFamily; + this.fontSubFamily = fontSubFamily; + } + + void setNonOtfGcid142(byte[] nonOtfGcid142) + { + this.nonOtfGcid142 = nonOtfGcid142; + } + + void 
setIsOTFAndPostScript(boolean isOTFAndPostScript) + { + this.isOTFAndPostScript = isOTFAndPostScript; + } + + // public because CFFParser is in a different package + public void setOtfROS(String otfRegistry, String otfOrdering, int otfSupplement) + { + this.otfRegistry = otfRegistry; + this.otfOrdering = otfOrdering; + this.otfSupplement = otfSupplement; + } +} diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/NameRecord.java b/fontbox/src/main/java/org/apache/fontbox/ttf/NameRecord.java index 982bff6ff3f..f6c747bc287 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/NameRecord.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/NameRecord.java @@ -180,6 +180,7 @@ void initData( TrueTypeFont ttf, TTFDataStream data ) throws IOException * * @return A string for this class. */ + @Override public String toString() { return @@ -190,9 +191,10 @@ public String toString() " " + string; } /** - * @return Returns the string. + * Use {@link NamingTable#getString(NameRecord)} + * @return Returns the string, if it was pre-loaded. */ - public String getString() + String getStringLazy() { return string; } diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/NamingTable.java b/fontbox/src/main/java/org/apache/fontbox/ttf/NamingTable.java index b8402e56819..3f69a0ab2dc 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/NamingTable.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/NamingTable.java @@ -20,9 +20,9 @@ import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.HashMap; +import java.util.Arrays; +import java.util.Collections; import java.util.List; -import java.util.Map; /** * This 'name'-table is a required table in a TrueType font. 
@@ -38,11 +38,10 @@ public class NamingTable extends TTFTable private List nameRecords; - private Map>>> lookupTable; - private String fontFamily = null; private String fontSubFamily = null; private String psName = null; + private byte[] stringTable; NamingTable() { @@ -63,70 +62,79 @@ void read(TrueTypeFont ttf, TTFDataStream data) throws IOException int numberOfNameRecords = data.readUnsignedShort(); int offsetToStartOfStringStorage = data.readUnsignedShort(); nameRecords = new ArrayList<>(numberOfNameRecords); + LoadOnlyHeaders onlyHeaders = ttf.getLoadOnlyHeaders(); for (int i=0; i< numberOfNameRecords; i++) { NameRecord nr = new NameRecord(); nr.initData(ttf, data); - nameRecords.add(nr); - } - - for (NameRecord nr : nameRecords) - { - // don't try to read invalid offsets, see PDFBOX-2608 - if (nr.getStringOffset() > getLength()) - { - nr.setString(null); - continue; - } - - data.seek(getOffset() + (2L*3)+numberOfNameRecords*2L*6+nr.getStringOffset()); - int platform = nr.getPlatformId(); - int encoding = nr.getPlatformEncodingId(); - Charset charset = StandardCharsets.ISO_8859_1; - if (platform == NameRecord.PLATFORM_WINDOWS && (encoding == NameRecord.ENCODING_WINDOWS_SYMBOL || encoding == NameRecord.ENCODING_WINDOWS_UNICODE_BMP)) + if (onlyHeaders == null || isUsefulForOnlyHeaders(nr)) { - charset = StandardCharsets.UTF_16; + nameRecords.add(nr); } - else if (platform == NameRecord.PLATFORM_UNICODE) - { - charset = StandardCharsets.UTF_16; - } - else if (platform == NameRecord.PLATFORM_ISO) + } + final long stringsStart = data.getCurrentPosition(); // == getOffset() + (2L*3)+numberOfNameRecords*2L*6 + stringTable = onlyHeaders == null + ? 
data.read((int) (getOffset() + getLength() - stringsStart)) + : null; + if (onlyHeaders != null) + { + // preload strings - profiler shows that it is faster than using 'stringTable' or 'data.readString' on demand + for (NameRecord nr : nameRecords) { - switch (encoding) + // don't try to read invalid offsets, see PDFBOX-2608 + if (nr.getStringOffset() > getLength()) { - case 0: - charset = StandardCharsets.US_ASCII; - break; - case 1: - //not sure is this is correct?? - charset = StandardCharsets.UTF_16BE; - break; - case 2: - charset = StandardCharsets.ISO_8859_1; - break; - default: - break; + nr.setString(null); + continue; } + + data.seek(stringsStart + nr.getStringOffset()); + String string = data.readString(nr.getStringLength(), getCharset(nr)); + nr.setString(string); } - String string = data.readString(nr.getStringLength(), charset); - nr.setString(string); } - // build multi-dimensional lookup table - lookupTable = new HashMap<>(nameRecords.size()); - for (NameRecord nr : nameRecords) + // sort to be able to binarySearch() + Collections.sort(nameRecords, NamingTable::compareWithoutString); + + readInterestingStrings(onlyHeaders); + initialized = true; + } + + private Charset getCharset(NameRecord nr) { + int platform = nr.getPlatformId(); + int encoding = nr.getPlatformEncodingId(); + Charset charset = StandardCharsets.ISO_8859_1; + if (platform == NameRecord.PLATFORM_WINDOWS && (encoding == NameRecord.ENCODING_WINDOWS_SYMBOL || encoding == NameRecord.ENCODING_WINDOWS_UNICODE_BMP)) + { + charset = StandardCharsets.UTF_16; + } + else if (platform == NameRecord.PLATFORM_UNICODE) + { + charset = StandardCharsets.UTF_16; + } + else if (platform == NameRecord.PLATFORM_ISO) { - // name id - Map>> platformLookup = lookupTable.computeIfAbsent(nr.getNameId(), k -> new HashMap<>()); - // platform id - Map> encodingLookup = platformLookup.computeIfAbsent(nr.getPlatformId(), k -> new HashMap<>()); - // encoding id - Map languageLookup = 
encodingLookup.computeIfAbsent(nr.getPlatformEncodingId(), k -> new HashMap<>(1)); - // language id / string - languageLookup.put(nr.getLanguageId(), nr.getString()); + switch (encoding) + { + case 0: + charset = StandardCharsets.US_ASCII; + break; + case 1: + //not sure is this is correct?? + charset = StandardCharsets.UTF_16BE; + break; + case 2: + charset = StandardCharsets.ISO_8859_1; + break; + default: + break; + } } + return charset; + } + private void readInterestingStrings(LoadOnlyHeaders onlyHeaders) { // extract strings of interest fontFamily = getEnglishName(NameRecord.NAME_FONT_FAMILY_NAME); fontSubFamily = getEnglishName(NameRecord.NAME_FONT_SUB_FAMILY_NAME); @@ -148,7 +156,43 @@ else if (platform == NameRecord.PLATFORM_ISO) psName = psName.trim(); } - initialized = true; + if (onlyHeaders != null) { + onlyHeaders.setName(psName); + onlyHeaders.setFontFamily(fontFamily, fontSubFamily); + } + } + + private static boolean isUsefulForOnlyHeaders(NameRecord nr) + { + int nameId = nr.getNameId(); + // see "psName =" and "getEnglishName()" + if (nameId == NameRecord.NAME_POSTSCRIPT_NAME + || nameId == NameRecord.NAME_FONT_FAMILY_NAME + || nameId == NameRecord.NAME_FONT_SUB_FAMILY_NAME) + { + int languageId = nr.getLanguageId(); + return languageId == NameRecord.LANGUAGE_UNICODE + || languageId == NameRecord.LANGUAGE_WINDOWS_EN_US; + } + return false; + } + + private static int compareWithoutString(NameRecord left, NameRecord right) + { + int result = Integer.compare(left.getNameId(), right.getNameId()); + if (result == 0) + { + result = Integer.compare(left.getPlatformId(), right.getPlatformId()); + } + if (result == 0) + { + result = Integer.compare(left.getPlatformEncodingId(), right.getPlatformEncodingId()); + } + if (result == 0) + { + result = Integer.compare(left.getLanguageId(), right.getLanguageId()); + } + return result; } /** @@ -199,22 +243,39 @@ private String getEnglishName(int nameId) */ public String getName(int nameId, int platformId, int 
encodingId, int languageId) { - Map>> platforms = lookupTable.get(nameId); - if (platforms == null) - { - return null; - } - Map> encodings = platforms.get(platformId); - if (encodings == null) + final NameRecord match = new NameRecord(); + match.setNameId(nameId); + match.setPlatformId(platformId); + match.setPlatformEncodingId(encodingId); + match.setLanguageId(languageId); + final int found = Collections.binarySearch(nameRecords, match, NamingTable::compareWithoutString); + return found < 0 ? null : getString(nameRecords.get(found)); + } + + public byte[] getStringBytes(NameRecord nr) + { + final int start = nr.getStringOffset(); + final int end = start + nr.getStringLength(); + return start < stringTable.length + ? Arrays.copyOfRange(stringTable, start, Math.min(end, stringTable.length)) + // don't try to read invalid offsets, see PDFBOX-2608 + : null; + } + + public String getString(NameRecord nr) + { + // don't try to read invalid offsets, see PDFBOX-2608 + if (nr == null || nr.getStringOffset() >= getLength()) { return null; } - Map languages = encodings.get(encodingId); - if (languages == null) + String preloaded = nr.getStringLazy(); + if (preloaded == null) { - return null; + preloaded = new String(stringTable, nr.getStringOffset(), nr.getStringLength(), getCharset(nr)); + nr.setString(preloaded); } - return languages.get(languageId); + return preloaded; } /** diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/RandomAccessReadDataStream.java b/fontbox/src/main/java/org/apache/fontbox/ttf/RandomAccessReadDataStream.java index 8e4b8aa1f70..73b8dde5c9b 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/RandomAccessReadDataStream.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/RandomAccessReadDataStream.java @@ -19,9 +19,12 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.util.logging.Level; +import java.util.logging.Logger; import org.apache.pdfbox.io.IOUtils; import 
org.apache.pdfbox.io.RandomAccessRead; +import org.apache.pdfbox.io.RandomAccessReadBuffer; /** * An implementation of the TTFDataStream using RandomAccessRead as source. @@ -37,21 +40,15 @@ class RandomAccessReadDataStream extends TTFDataStream /** * Constructor. * - * @param randomAccessRead source to be read from + * @param randomAccessRead source to be read from, is read to {@code byte[]} and automatically closed * * @throws IOException If there is a problem reading the source data. */ RandomAccessReadDataStream(RandomAccessRead randomAccessRead) throws IOException { length = randomAccessRead.length(); - data = new byte[(int) length]; - int remainingBytes = data.length; - int amountRead; - while ((amountRead = randomAccessRead.read(data, data.length - remainingBytes, - remainingBytes)) > 0) - { - remainingBytes -= amountRead; - } + data = randomAccessRead.readNBytes((int) length); + IOUtils.closeQuietly(randomAccessRead); } /** @@ -191,4 +188,15 @@ public long getOriginalDataSize() { return length; } + + @Override + public RandomAccessRead getSubReader(long length) + { + try { + return new RandomAccessReadBuffer(data).createView(currentPosition, length); + } catch (IOException ex) { + Logger.getLogger(RandomAccessReadDataStream.class.getName()).log(Level.SEVERE, null, ex); + return null; + } + } } diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/RandomAccessReadUncachedDataStream.java b/fontbox/src/main/java/org/apache/fontbox/ttf/RandomAccessReadUncachedDataStream.java new file mode 100644 index 00000000000..cf2f4a84485 --- /dev/null +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/RandomAccessReadUncachedDataStream.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.fontbox.ttf; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.pdfbox.io.RandomAccessRead; +import org.apache.pdfbox.io.RandomAccessReadView; + +/** + * In contrast to {@link RandomAccessReadDataStream}, + * this class doesn't pre-load {@code RandomAccessRead} into a {@code byte[]}, + * it works with the buffer directly. + * + * Performance: it may be slower if whole buffer is read via read() operations, + * but much faster if most of the buffer is skipped + * + * @author Mykola Bohdiuk + */ +class RandomAccessReadUncachedDataStream extends TTFDataStream +{ + private final long length; + private final RandomAccessRead randomAccessRead; + + /** + * @throws IOException If there is a problem reading the source length. + */ + RandomAccessReadUncachedDataStream(RandomAccessRead randomAccessRead) throws IOException + { + this.length = randomAccessRead.length(); + this.randomAccessRead = randomAccessRead; + } + + /** + * {@inheritDoc} + */ + @Override + public long getCurrentPosition() throws IOException + { + return randomAccessRead.getPosition(); + } + + /** + * Close the underlying resources. + * + * @throws IOException If there is an error closing the resources. 
+ */ + @Override + public void close() throws IOException + { + randomAccessRead.close(); + } + + /** + * {@inheritDoc} + */ + @Override + public int read() throws IOException + { + return randomAccessRead.read(); + } + + /** + * {@inheritDoc} + */ + @Override + public final long readLong() throws IOException + { + return ((long) readInt() << 32) | (readInt() & 0xFFFFFFFFL); + } + + /** + * {@inheritDoc} + */ + private int readInt() throws IOException + { + int b1 = read(); + int b2 = read(); + int b3 = read(); + int b4 = read(); + return (b1 << 24) | (b2 << 16) | (b3 << 8) | b4; + } + + /** + * {@inheritDoc} + */ + @Override + public void seek(long pos) throws IOException + { + randomAccessRead.seek(pos); + } + + /** + * {@inheritDoc} + */ + @Override + public int read(byte[] b, int off, int len) throws IOException + { + randomAccessRead.readExact(b, off, len); + return len; + } + + /** + * Lifetime of returned InputStream is bound by {@code this} lifetime, it won't close underlying {@code RandomAccessRead}. 
+ * + * {@inheritDoc} + */ + @Override + public InputStream getOriginalData() throws IOException + { + return new RandomAccessReadNonClosingInputStream(randomAccessRead.createView(0, length)); + } + + /** + * {@inheritDoc} + */ + @Override + public long getOriginalDataSize() + { + return length; + } + + @Override + public RandomAccessRead getSubReader(long length) + { + try + { + return randomAccessRead.createView(randomAccessRead.getPosition(), length); + } + catch (IOException ex) + { + assert false : "Please implement " + randomAccessRead.getClass() + ".createView()"; + return null; + } + } + + private static final class RandomAccessReadNonClosingInputStream extends InputStream { + + private final RandomAccessReadView randomAccessRead; + + public RandomAccessReadNonClosingInputStream(RandomAccessReadView randomAccessRead) + { + this.randomAccessRead = randomAccessRead; + } + + @Override + public int read() throws IOException + { + return randomAccessRead.read(); + } + + @Override + public int read(byte[] b) throws IOException + { + return randomAccessRead.read(b); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException + { + return randomAccessRead.read(b, off, len); + } + + @Override + public long skip(long n) throws IOException + { + randomAccessRead.seek(randomAccessRead.getPosition() + n); + return n; + } + + @Override + public void close() throws IOException { + // WARNING: .close() will close RandomAccessReadMemoryMappedFile if this View was based on it +// randomAccessRead.close(); + } + } +} diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/TTCDataStream.java b/fontbox/src/main/java/org/apache/fontbox/ttf/TTCDataStream.java index bcdae654406..ba6fdd0f8c8 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/TTCDataStream.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/TTCDataStream.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.InputStream; +import org.apache.pdfbox.io.RandomAccessRead; 
/** * A wrapper for a TTF stream inside a TTC file, does not close the underlying shared stream. @@ -83,4 +84,9 @@ public long getOriginalDataSize() return stream.getOriginalDataSize(); } + @Override + public RandomAccessRead getSubReader(long length) + { + return stream.getSubReader(length); + } } diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/TTFDataStream.java b/fontbox/src/main/java/org/apache/fontbox/ttf/TTFDataStream.java index e39de112e5d..cfa70463f1c 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/TTFDataStream.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/TTFDataStream.java @@ -24,6 +24,7 @@ import java.nio.charset.StandardCharsets; import java.util.Calendar; import java.util.TimeZone; +import org.apache.pdfbox.io.RandomAccessRead; /** * An abstract class to read a data stream. @@ -237,6 +238,10 @@ public String readTag() throws IOException */ public abstract void seek(long pos) throws IOException; + public void skip(int delta) throws IOException { + seek(getCurrentPosition() + delta); + } + /** * Read a specific number of bytes from the stream. * @@ -300,4 +305,12 @@ public byte[] read(int numberOfBytes) throws IOException * @return The size of the original data. */ public abstract long getOriginalDataSize(); + + /** + * {@code SubReader.close()} should never close {@code this} stream, only itself. + * Optional, caller can read {@code byte[]} instead. + * + * @return null if not supported. 
Please close() the result + */ + public abstract /*@Nullable*/ RandomAccessRead getSubReader(long length); } diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/TTFParser.java b/fontbox/src/main/java/org/apache/fontbox/ttf/TTFParser.java index 07942b64b8a..b13badfe607 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/TTFParser.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/TTFParser.java @@ -20,6 +20,7 @@ import java.io.InputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.io.RandomAccessRead; /** @@ -32,6 +33,7 @@ public class TTFParser private static final Log LOG = LogFactory.getLog(TTFParser.class); private boolean isEmbedded = false; + private LoadOnlyHeaders loadOnlyHeaders; /** * Constructor. @@ -60,7 +62,9 @@ public TTFParser(boolean isEmbedded) */ public TrueTypeFont parse(RandomAccessRead randomAccessRead) throws IOException { - RandomAccessReadDataStream dataStream = new RandomAccessReadDataStream(randomAccessRead); + TTFDataStream dataStream = loadOnlyHeaders != null + ? 
new RandomAccessReadUncachedDataStream(randomAccessRead) + : new RandomAccessReadDataStream(randomAccessRead); try { return parse(dataStream); @@ -115,6 +119,7 @@ TrueTypeFont parse(TTFDataStream raf) throws IOException { TrueTypeFont font = newFont(raf); font.setVersion(raf.read32Fixed()); + font.setLoadOnlyHeaders(loadOnlyHeaders); int numberOfTables = raf.readUnsignedShort(); int searchRange = raf.readUnsignedShort(); int entrySelector = raf.readUnsignedShort(); @@ -140,9 +145,21 @@ TrueTypeFont parse(TTFDataStream raf) throws IOException } } } - // parse tables - parseTables(font); - return font; + if (loadOnlyHeaders == null) + { + parseTables(font); + return font; + } + else + { + parseTableHeaders(font, loadOnlyHeaders); + return null; + } + } + + public void setLoadOnlyHeaders(LoadOnlyHeaders loadOnlyHeaders) + { + this.loadOnlyHeaders = loadOnlyHeaders; } TrueTypeFont newFont(TTFDataStream raf) @@ -227,6 +244,93 @@ else if (!isOTF) } } + /** + * Based on {@link #parseTables()} + * Parse all table headers and check if all needed tables are present. + * + * This method can be optimized further by skipping unused portions inside each individual table parser + * + * @param font the TrueTypeFont instance holding the parsed data. + * @throws IOException If there is an error parsing the TrueType font. 
+ */ + private void parseTableHeaders(TrueTypeFont font, LoadOnlyHeaders outHeaders) throws IOException + { + try { + try + { + font.getNaming(); // calls NamingTable.readTable(); + } + catch (IOException ex) + { + return; // ignore, empty name is reported differently than exception; 'finally' below still closes the font + } + font.getHeader(); // calls HeaderTable.readTable(); + + // only these 5 are used + // sFamilyClass = os2WindowsMetricsTable.getFamilyClass(); + // usWeightClass = os2WindowsMetricsTable.getWeightClass(); + // ulCodePageRange1 = (int) os2WindowsMetricsTable.getCodePageRange1(); + // ulCodePageRange2 = (int) os2WindowsMetricsTable.getCodePageRange2(); + // panose = os2WindowsMetricsTable.getPanose(); + outHeaders.setOs2Windows(font.getOS2Windows()); + + boolean isOTFAndPostScript; + if (font instanceof OpenTypeFont && ((OpenTypeFont) font).isPostScript()) + { + isOTFAndPostScript = true; + if (((OpenTypeFont) font).isSupportedOTF()) + { + ((OpenTypeFont) font).getCFF(); // calls CFFTable.readTable(); + } + } + else + { + isOTFAndPostScript = false; + TTFTable gcid = font.getTableMap().get("gcid"); + if (gcid != null && gcid.getLength() >= LoadOnlyHeaders.BYTES_GCID) + { + outHeaders.setNonOtfGcid142(font.getTableNBytes(gcid, LoadOnlyHeaders.BYTES_GCID)); + } + } + outHeaders.setIsOTFAndPostScript(isOTFAndPostScript); + + boolean isOTF = font instanceof OpenTypeFont; + boolean isPostScript = isOTF ? isOTFAndPostScript : font.tables.containsKey(CFFTable.TAG); + + if (isPostScript && !isOTF) + { + loadOnlyHeaders.setException(new IOException("True Type fonts using CFF outlines are not supported")); + return; + } + + // list taken from parseTables(), detect them, but don't spend time parsing + final String[] mandatoryTables = { + HeaderTable.TAG, + HorizontalHeaderTable.TAG, + MaximumProfileTable.TAG, + isEmbedded ? null : PostScriptTable.TAG, // in an embedded font this table is optional + isPostScript ? null : IndexToLocationTable.TAG, + isPostScript ? 
null : GlyphTable.TAG, + isEmbedded ? null : NamingTable.TAG, + HorizontalMetricsTable.TAG, + isEmbedded ? null : CmapTable.TAG, + }; + + for (String tag : mandatoryTables) + { + if (tag != null && !font.tables.containsKey(tag)) + { + loadOnlyHeaders.setException(new IOException("'" + tag + "' table is mandatory")); + return; + } + } + } catch (IOException ex) { + loadOnlyHeaders.setException(ex); + } finally { + IOUtils.closeQuietly(font); + } + } + protected boolean allowCFF() { return false; diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java b/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java index 2474e6ff168..f0439e2f157 100755 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java @@ -24,6 +24,7 @@ import java.io.OutputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.Calendar; import java.util.HashMap; import java.util.Iterator; @@ -335,12 +336,15 @@ else if (encoding == 1) // ISO 10646= charset = StandardCharsets.UTF_16BE; } } - String value = nameRecord.getString(); - if (nameRecord.getNameId() == 6 && prefix != null) + byte[] value = name.getStringBytes(nameRecord); + if (nameRecord.getNameId() == 6 && prefix != null && value != null) { - value = prefix + value; + byte[] prefixBytes = prefix.getBytes(charset); + byte[] result = Arrays.copyOf(prefixBytes, prefixBytes.length + value.length); + System.arraycopy(value, 0, result, prefixBytes.length, value.length); + value = result; } - names[j] = value.getBytes(charset); + names[j] = value; j++; } } diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/TrueTypeCollection.java b/fontbox/src/main/java/org/apache/fontbox/ttf/TrueTypeCollection.java index 69b16134965..9c075a77e4d 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/TrueTypeCollection.java +++ 
b/fontbox/src/main/java/org/apache/fontbox/ttf/TrueTypeCollection.java @@ -21,6 +21,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.io.RandomAccessReadBuffer; @@ -46,7 +47,18 @@ public class TrueTypeCollection implements Closeable */ public TrueTypeCollection(File file) throws IOException { - this(new RandomAccessReadBufferedFile(file)); + this(new RandomAccessReadBufferedFile(file), false); + } + + /** + * Creates a new TrueTypeCollection from a .ttc file. + * + * @param file The TTC file. + * @throws IOException If the font could not be parsed. + */ + public TrueTypeCollection(File file, boolean dontReadToByteArray) throws IOException + { + this(new RandomAccessReadBufferedFile(file), dontReadToByteArray); } /** @@ -57,7 +69,7 @@ public TrueTypeCollection(File file) throws IOException */ public TrueTypeCollection(InputStream stream) throws IOException { - this(new RandomAccessReadBuffer(stream)); + this(new RandomAccessReadBuffer(stream), false); } /** @@ -66,9 +78,11 @@ public TrueTypeCollection(InputStream stream) throws IOException * @param randomAccessRead * @throws IOException If the font could not be parsed. */ - TrueTypeCollection(RandomAccessRead randomAccessRead) throws IOException + TrueTypeCollection(RandomAccessRead randomAccessRead, boolean dontReadToByteArray) throws IOException { - this.stream = new RandomAccessReadDataStream(randomAccessRead); + this.stream = dontReadToByteArray + ? 
new RandomAccessReadUncachedDataStream(randomAccessRead) + : new RandomAccessReadDataStream(randomAccessRead); // TTC header String tag = stream.readTag(); @@ -106,12 +120,31 @@ public void processAllFonts(TrueTypeFontProcessor trueTypeFontProcessor) throws { for (int i = 0; i < numFonts; i++) { - TrueTypeFont font = getFontAtIndex(i); + TrueTypeFont font = getFontAtIndex(i, null); trueTypeFontProcessor.process(font); } } - private TrueTypeFont getFontAtIndex(int idx) throws IOException + /** + * Run the callback for each TT font in the collection. + * + * @param trueTypeFontProcessor the object with the callback method. + * @throws IOException if something went wrong when calling the TrueTypeFontProcessor + */ + public void processAllFontHeaders(TrueTypeFontHeadersProcessor trueTypeFontProcessor) throws IOException + { + assert stream instanceof RandomAccessReadUncachedDataStream + : "For efficiency, we do not read whole file to byte[]"; + for (int i = 0; i < numFonts; i++) + { + LoadOnlyHeaders headers = new LoadOnlyHeaders(); + TrueTypeFont ttf = getFontAtIndex(i, headers); + IOUtils.closeQuietly(ttf); // all data is already saved in 'headers' + trueTypeFontProcessor.process(headers); + } + } + + private TrueTypeFont getFontAtIndex(int idx, LoadOnlyHeaders onlyHeaders) throws IOException { stream.seek(fontOffsets[idx]); TTFParser parser; @@ -123,6 +156,7 @@ private TrueTypeFont getFontAtIndex(int idx) throws IOException { parser = new TTFParser(false); } + parser.setLoadOnlyHeaders(onlyHeaders); stream.seek(fontOffsets[idx]); return parser.parse(new TTCDataStream(stream)); } @@ -138,7 +172,7 @@ public TrueTypeFont getFontByName(String name) throws IOException { for (int i = 0; i < numFonts; i++) { - TrueTypeFont font = getFontAtIndex(i); + TrueTypeFont font = getFontAtIndex(i, null); if (font.getName().equals(name)) { return font; @@ -156,6 +190,15 @@ public interface TrueTypeFontProcessor void process(TrueTypeFont ttf) throws IOException; } + /** + * Implement 
the callback method to call {@link TrueTypeCollection#processAllFontHeaders(TrueTypeFontHeadersProcessor)}. + */ + @FunctionalInterface + public interface TrueTypeFontHeadersProcessor + { + void process(LoadOnlyHeaders ttf) throws IOException; + } + @Override public void close() throws IOException { diff --git a/fontbox/src/main/java/org/apache/fontbox/ttf/TrueTypeFont.java b/fontbox/src/main/java/org/apache/fontbox/ttf/TrueTypeFont.java index 27080ae31cb..0a4d6ad508d 100644 --- a/fontbox/src/main/java/org/apache/fontbox/ttf/TrueTypeFont.java +++ b/fontbox/src/main/java/org/apache/fontbox/ttf/TrueTypeFont.java @@ -54,6 +54,7 @@ public class TrueTypeFont implements FontBoxFont, Closeable private final Object lockReadtable = new Object(); private final Object lockPSNames = new Object(); private final List enabledGsubFeatures = new ArrayList<>(); + private LoadOnlyHeaders loadOnlyHeaders; /** * Constructor. Clients should use the TTFParser to create a new TrueTypeFont object. @@ -143,6 +144,32 @@ public byte[] getTableBytes(TTFTable table) throws IOException } } + /** + * Returns the raw bytes of the given table, no more than {@code limit} bytes. + * + * @param table the table to read. + * @param limit maximum length of array to return; the stream position is restored afterwards + * @return the first {@code min(limit, table length)} raw bytes of the given table + * + * @throws IOException if there was an error accessing the table. + */ + public byte[] getTableNBytes(TTFTable table, int limit) throws IOException + { + synchronized (lockReadtable) + { + // save current position + long currentPosition = data.getCurrentPosition(); + data.seek(table.getOffset()); + + // read at most 'limit' bytes; clamp before narrowing so a huge table length cannot turn negative + byte[] bytes = data.read((int) Math.min(limit, table.getLength())); + + // restore current position + data.seek(currentPosition); + return bytes; + } + } + /** * This will get the table for the given tag. 
* @@ -768,6 +795,15 @@ public void enableVerticalSubstitutions() enableGsubFeature("vert"); } + void setLoadOnlyHeaders(LoadOnlyHeaders loadOnlyHeaders) { + this.loadOnlyHeaders = loadOnlyHeaders; + } + + /** Used by table parsers to detect 'only headers' mode */ + LoadOnlyHeaders getLoadOnlyHeaders() { + return loadOnlyHeaders; + } + @Override public String toString() { diff --git a/io/src/main/java/org/apache/pdfbox/io/RandomAccessRead.java b/io/src/main/java/org/apache/pdfbox/io/RandomAccessRead.java index 41bb9c67327..04f2d0f1d41 100644 --- a/io/src/main/java/org/apache/pdfbox/io/RandomAccessRead.java +++ b/io/src/main/java/org/apache/pdfbox/io/RandomAccessRead.java @@ -18,6 +18,7 @@ import java.io.Closeable; import java.io.IOException; +import java.io.InputStream; /** * An interface allowing random access read operations. @@ -55,7 +56,85 @@ default int read(byte[] b) throws IOException * @throws IOException If there was an error while reading the data. */ int read(byte[] b, int offset, int length) throws IOException; - + + /** + * @throws IOException if less than {@code length} bytes are available + */ + default byte[] readExact(byte[] b, int offset, int length) throws IOException + { + if (length() - getPosition() < length) + { + throw new IOException("End-of-data"); + } + int read = readNBytes(b, offset, length); + if (read < length) + { + rewind(read); + throw new IOException("End-of-data"); + } + return b; + } + + /** + * @throws IOException if less than {@code length} bytes are available + */ + default byte[] readExact(int length) throws IOException + { + return readExact(new byte[length], 0, length); + } + + /** + * Finishes when {@code length} bytes are read, or EOF. Always returns {@code result}, never trims. 
+ * @see InputStream#readNBytes(byte[], int, int) + * @return the number of bytes actually read; less than {@code result.length} only if EOF was reached first + */ + default int readNBytes(byte[] result) throws IOException + { + return readNBytes(result, 0, result.length); + } + + /** + * Finishes when {@code length} bytes are read, or EOF. Always returns {@code byte[length]}, never trims. + * @see InputStream#readNBytes(byte[], int, int) + * @return a new array of exactly {@code length} bytes; positions not filled before EOF are left as zero + */ + default byte[] readNBytes(int length) throws IOException + { + byte[] result = new byte[length]; + readNBytes(result, 0, result.length); + return result; + } + + /** + * Finishes when {@code length} bytes are read, or EOF. + * @see InputStream#readNBytes(byte[], int, int) + * @return number of bytes actually read, {@code 0} if EOF is hit before any byte is read + */ + default int readNBytes(byte[] result, int offset, int length) throws IOException + { + if (Integer.MAX_VALUE - length < offset) + { + throw new IOException("Integer overflow"); + } + int cursor = offset; + int end = offset + length; + while (cursor < end) + { + int read = read(result, cursor, end - cursor); + if (read < 0) + { + break; + } + else if (read == 0) + { + // a zero-length read would keep this loop spinning forever, so fail fast (this should never happen) + throw new IOException("Read 0 bytes, risk of an infinite loop"); + } + cursor += read; + } + return cursor - offset; + } + /** * Returns offset of next byte to be returned by a read method. 
* diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java index 5188f13c3f7..42371bdc21e 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java @@ -445,12 +445,12 @@ private int checkForEndOfString(final int bracesParameter) throws IOException } // Check the next 3 bytes if available byte[] nextThreeBytes = new byte[3]; - int amountRead = source.read(nextThreeBytes); + int amountRead = source.readNBytes(nextThreeBytes); if (amountRead > 0) { source.rewind(amountRead); } - if (amountRead < 3) + if (amountRead < 2) { return bracesParameter; } @@ -465,6 +465,7 @@ private int checkForEndOfString(final int bracesParameter) throws IOException && (nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>')) // || // (nextThreeBytes[0] == ASCII_CR && nextThreeBytes[1] == ASCII_LF + && amountRead >= 3 && (nextThreeBytes[2] == '/' || nextThreeBytes[2] == '>')) // ) { diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java index d4d328be713..9d7d8edb89c 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java @@ -502,23 +502,9 @@ private long getStartxrefOffset() throws IOException try { final int trailByteCount = (fileLen < readTrailBytes) ? 
(int) fileLen : readTrailBytes; - buf = new byte[trailByteCount]; skipBytes = fileLen - trailByteCount; source.seek(skipBytes); - int off = 0; - int readBytes; - while (off < trailByteCount) - { - readBytes = source.read(buf, off, trailByteCount - off); - // in order to not get stuck in a loop we check readBytes (this should never happen) - if (readBytes < 1) - { - throw new IOException( - "No more bytes to read for trailing buffer, but expected: " - + (trailByteCount - off)); - } - off += readBytes; - } + buf = source.readExact(trailByteCount); } finally { diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java index c6660b43dee..3cbb5a7249a 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java @@ -321,7 +321,7 @@ else if( next.equals( "false" ) ) private boolean hasNoFollowingBinData() throws IOException { // as suggested in PDFBOX-1164 - final int readBytes = source.read(binCharTestArr, 0, MAX_BIN_CHAR_TEST_LENGTH); + final int readBytes = source.readNBytes(binCharTestArr); boolean noBinData = true; int startOpIdx = -1; int endOpIdx = -1; diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java index 6202660de4f..c398218e7da 100755 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java @@ -124,7 +124,7 @@ public void parse(XrefTrailerResolver resolver) throws IOException byte[] currLine = new byte[w[0] + w[1] + w[2]]; while (!isEOF() && objectNumbers.hasNext()) { - readNextValue(currLine); + source.readNBytes(currLine); // get the current objID long objID = objectNumbers.next(); // default value is 1 if w[0] == 0, otherwise parse first field @@ -154,16 
+154,6 @@ public void parse(XrefTrailerResolver resolver) throws IOException close(); } - private void readNextValue(byte[] value) throws IOException - { - int remainingBytes = value.length; - int amountRead; - while ((amountRead = source.read(value, value.length - remainingBytes, remainingBytes)) > 0) - { - remainingBytes -= amountRead; - } - } - private long parseValue(byte[] data, int start, int length) { long value = 0; diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/FileSystemFontProvider.java b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/FileSystemFontProvider.java index cd09e36b98a..17a9ce6eaf9 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/FileSystemFontProvider.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/FileSystemFontProvider.java @@ -37,9 +37,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.fontbox.FontBoxFont; -import org.apache.fontbox.cff.CFFCIDFont; -import org.apache.fontbox.cff.CFFFont; -import org.apache.fontbox.ttf.NamingTable; +import org.apache.fontbox.ttf.LoadOnlyHeaders; import org.apache.fontbox.ttf.OS2WindowsMetricsTable; import org.apache.fontbox.ttf.OTFParser; import org.apache.fontbox.ttf.OpenTypeFont; @@ -59,6 +57,12 @@ final class FileSystemFontProvider extends FontProvider { private static final Log LOG = LogFactory.getLog(FileSystemFontProvider.class); + /** + * This option changes publicly visible behaviour: ".pdfbox.cache" file will have hash="-" for all files. + * After implementing {@link LoadOnlyHeaders}, parsing font headers is faster than checksumming. 
+ */ + private static final boolean SKIP_CHECKSUMS = "true".equals(System.getProperty("pdfbox.fontcache.skipchecksums")); + private static final String CHECKSUM_PLACEHOLDER = "-"; private final List fontInfoList = new ArrayList<>(); private final FontCache cache; @@ -315,7 +319,7 @@ private FSFontInfo createFSIgnored(File file, FontFormat format, String postScri String hash; try { - hash = computeHash(Files.newInputStream(file.toPath())); + hash = SKIP_CHECKSUMS ? CHECKSUM_PLACEHOLDER : computeHash(file); } catch (IOException ex) { @@ -537,6 +541,10 @@ private List loadDiskCache(List files) { try (BufferedReader reader = new BufferedReader(new FileReader(diskCacheFile))) { + // consequent lines usually share the same font file (e.g. "Courier", "Courier-Bold", "Courier-Oblique") + File lastFile = null; + String lastHash = null; + // String line; while ((line = reader.readLine()) != null) { @@ -599,23 +607,36 @@ private List loadDiskCache(List files) } if (fontFile.exists()) { - boolean keep = false; // if the file exists, find out whether it's the same file. 
// first check whether time is different and if yes, whether hash is different - if (fontFile.lastModified() != lastModified) + boolean keep = fontFile.lastModified() == lastModified; + if (!keep && !SKIP_CHECKSUMS) { - String newHash = computeHash(Files.newInputStream(fontFile.toPath())); - if (newHash.equals(hash)) + String newHash; + if (hash.equals(lastHash) && fontFile.equals(lastFile)) + { + newHash = lastHash; // already computed + } + else + { + try + { + newHash = computeHash(fontFile); + lastFile = fontFile; + lastHash = newHash; + } + catch (IOException ex) + { + LOG.debug("Error reading font file " + fontFile.getAbsolutePath(), ex); + newHash = ""; + } + } + if (hash.equals(newHash)) { keep = true; lastModified = fontFile.lastModified(); - hash = newHash; } } - else - { - keep = true; - } if (keep) { FSFontInfo info = new FSFontInfo(fontFile, format, postScriptName, @@ -658,9 +679,10 @@ private List loadDiskCache(List files) */ private void addTrueTypeCollection(final File ttcFile) throws IOException { - try (TrueTypeCollection ttc = new TrueTypeCollection(ttcFile)) + try (TrueTypeCollection ttc = new TrueTypeCollection(ttcFile, true)) { - ttc.processAllFonts(ttf -> addTrueTypeFontImpl(ttf, ttcFile)); + String hash = SKIP_CHECKSUMS ? 
CHECKSUM_PLACEHOLDER : computeHash(ttcFile); + ttc.processAllFontHeaders(ttf -> addTrueTypeFontImpl(ttf, ttcFile, hash)); } catch (IOException e) { @@ -677,20 +699,22 @@ private void addTrueTypeFont(File ttfFile) throws IOException FontFormat fontFormat = null; try { + TTFParser parser; if (ttfFile.getPath().toLowerCase().endsWith(".otf")) { fontFormat = FontFormat.OTF; - OTFParser parser = new OTFParser(false); - OpenTypeFont otf = parser.parse(new RandomAccessReadBufferedFile(ttfFile)); - addTrueTypeFontImpl(otf, ttfFile); + parser = new OTFParser(false); } else { fontFormat = FontFormat.TTF; - TTFParser parser = new TTFParser(false); - TrueTypeFont ttf = parser.parse(new RandomAccessReadBufferedFile(ttfFile)); - addTrueTypeFontImpl(ttf, ttfFile); + parser = new TTFParser(false); } + LoadOnlyHeaders headers = new LoadOnlyHeaders(); + parser.setLoadOnlyHeaders(headers); + IOUtils.closeQuietly(parser.parse(new RandomAccessReadBufferedFile(ttfFile))); + addTrueTypeFontImpl(headers, ttfFile, + SKIP_CHECKSUMS ? CHECKSUM_PLACEHOLDER : computeHash(ttfFile)); } catch (IOException e) { @@ -702,25 +726,27 @@ private void addTrueTypeFont(File ttfFile) throws IOException /** * Adds an OTF or TTF font to the file cache. To reduce memory, the parsed font is not cached. 
*/ - private void addTrueTypeFontImpl(TrueTypeFont ttf, File file) throws IOException + private void addTrueTypeFontImpl(LoadOnlyHeaders ttf, File file, String fileHash) throws IOException { - try + final IOException exception = ttf.getException(); + if (exception == null) { // read PostScript name, if any - if (ttf.getName() != null && ttf.getName().contains("|")) + final String name = ttf.getName(); + if (name != null && name.contains("|")) { fontInfoList.add(createFSIgnored(file, FontFormat.TTF, "*skippipeinname*")); - LOG.warn("Skipping font with '|' in name " + ttf.getName() + " in file " + file); + LOG.warn("Skipping font with '|' in name " + name + " in file " + file); } - else if (ttf.getName() != null) + else if (name != null) { // ignore bitmap fonts - if (ttf.getHeader() == null) + Integer macStyle = ttf.getHeaderMacStyle(); + if (macStyle == null) { - fontInfoList.add(createFSIgnored(file, FontFormat.TTF, ttf.getName())); + fontInfoList.add(createFSIgnored(file, FontFormat.TTF, name)); return; } - int macStyle = ttf.getHeader().getMacStyle(); int sFamilyClass = -1; int usWeightClass = -1; @@ -738,59 +764,41 @@ else if (ttf.getName() != null) panose = os2WindowsMetricsTable.getPanose(); } - String hash = computeHash(ttf.getOriginalData()); - String format; - if (ttf instanceof OpenTypeFont && ((OpenTypeFont) ttf).isPostScript()) + FontFormat format; + CIDSystemInfo ros = null; + if (ttf.isOpenTypePostScript()) { - format = "OTF"; - CIDSystemInfo ros = null; - OpenTypeFont otf = (OpenTypeFont) ttf; - if (otf.isSupportedOTF() && otf.getCFF() != null) - { - CFFFont cff = otf.getCFF().getFont(); - if (cff instanceof CFFCIDFont) - { - CFFCIDFont cidFont = (CFFCIDFont) cff; - String registry = cidFont.getRegistry(); - String ordering = cidFont.getOrdering(); - int supplement = cidFont.getSupplement(); - ros = new CIDSystemInfo(registry, ordering, supplement); - } + format = FontFormat.OTF; + String registry = ttf.getOtfRegistry(); + String ordering = 
ttf.getOtfOrdering(); + if (registry != null || ordering != null) { + ros = new CIDSystemInfo(registry, ordering, ttf.getOtfSupplement()); + } } - fontInfoList.add(new FSFontInfo(file, FontFormat.OTF, ttf.getName(), ros, - usWeightClass, sFamilyClass, ulCodePageRange1, ulCodePageRange2, - macStyle, panose, this, hash, file.lastModified())); } else { - CIDSystemInfo ros = null; - if (ttf.getTableMap().containsKey("gcid")) + byte[] gcid = ttf.getNonOtfTableGCID142(); + if (gcid != null) { // Apple's AAT fonts have a "gcid" table with CID info - byte[] bytes = ttf.getTableBytes(ttf.getTableMap().get("gcid")); - String reg = new String(bytes, 10, 64, StandardCharsets.US_ASCII); + String reg = new String(gcid, 10, 64, StandardCharsets.US_ASCII); String registryName = reg.substring(0, reg.indexOf('\0')); - String ord = new String(bytes, 76, 64, StandardCharsets.US_ASCII); + String ord = new String(gcid, 76, 64, StandardCharsets.US_ASCII); String orderName = ord.substring(0, ord.indexOf('\0')); - int supplementVersion = bytes[140] << 8 & (bytes[141] & 0xFF); + int supplementVersion = (gcid[140] & 0xFF) << 8 | (gcid[141] & 0xFF); // combine high and low byte; the former '&' always yielded 0 ros = new CIDSystemInfo(registryName, orderName, supplementVersion); } - - format = "TTF"; - fontInfoList.add(new FSFontInfo(file, FontFormat.TTF, ttf.getName(), ros, - usWeightClass, sFamilyClass, ulCodePageRange1, ulCodePageRange2, - macStyle, panose, this, hash, file.lastModified())); + format = FontFormat.TTF; } + fontInfoList.add(new FSFontInfo(file, format, name, ros, + usWeightClass, sFamilyClass, ulCodePageRange1, ulCodePageRange2, + macStyle, panose, this, fileHash, file.lastModified())); if (LOG.isTraceEnabled()) { - NamingTable name = ttf.getNaming(); - if (name != null) - { - LOG.trace(format +": '" + name.getPostScriptName() + "' / '" + - name.getFontFamily() + "' / '" + - name.getFontSubFamily() + "'"); - } + LOG.trace(format.name() +": '" + name + "' / '" + + ttf.getFontFamily() + "' / '" + + ttf.getFontSubFamily() + "'"); } } else @@ -799,14 
+807,10 @@ else if (ttf.getName() != null) LOG.warn("Missing 'name' entry for PostScript name in font " + file); } } - catch (IOException e) + else { fontInfoList.add(createFSIgnored(file, FontFormat.TTF, "*skipexception*")); - LOG.warn("Could not load font file: " + file, e); - } - finally - { - ttf.close(); + LOG.warn("Could not load font file: " + file, exception); } } @@ -830,7 +834,7 @@ private void addType1Font(File pfbFile) throws IOException LOG.warn("Skipping font with '|' in name " + type1.getName() + " in file " + pfbFile); return; } - String hash = computeHash(Files.newInputStream(pfbFile.toPath())); + String hash = SKIP_CHECKSUMS ? CHECKSUM_PLACEHOLDER : computeHash(pfbFile); fontInfoList.add(new FSFontInfo(pfbFile, FontFormat.PFB, type1.getName(), null, -1, -1, 0, 0, -1, null, this, hash, pfbFile.lastModified())); @@ -868,11 +872,11 @@ public List getFontInfo() return fontInfoList; } - private static String computeHash(InputStream is) throws IOException + private static String computeHash(File file) throws IOException { CRC32 crc = new CRC32(); - try + try (InputStream is = Files.newInputStream(file.toPath())) { byte[] buffer = new byte[4096]; int readBytes; @@ -884,9 +888,5 @@ private static String computeHash(InputStream is) throws IOException long hash = crc.getValue(); return Long.toHexString(hash); } - finally - { - IOUtils.closeQuietly(is); - } } } diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java index 79e0c6a1edd..ec3c8e2c984 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java @@ -504,14 +504,7 @@ private TTFParser getParser(RandomAccessRead randomAccessRead, boolean isEmbedde throws IOException { long startPos = randomAccessRead.getPosition(); - byte[] tagBytes = new byte[4]; - int remainingBytes = tagBytes.length; - int 
amountRead; - while ((amountRead = randomAccessRead.read(tagBytes, tagBytes.length - remainingBytes, - remainingBytes)) > 0) - { - remainingBytes -= amountRead; - } + byte[] tagBytes = randomAccessRead.readNBytes(4); randomAccessRead.seek(startPos); if ("OTTO".equals(new String(tagBytes, StandardCharsets.US_ASCII))) { diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java index 42e16523579..7e68104f4ff 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java @@ -341,15 +341,7 @@ private static byte[] getFontHeader(COSDictionary fontDescriptor) throws IOExcep { try (RandomAccessRead fontView = fontFile.createView()) { - int headerLength = 4; - header = new byte[headerLength]; - int remainingBytes = headerLength; - int amountRead; - while ((amountRead = fontView.read(header, headerLength - remainingBytes, - remainingBytes)) > 0) - { - remainingBytes -= amountRead; - } + header = fontView.readNBytes(4); } catch (IOException ex) { diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDTrueTypeFont.java b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDTrueTypeFont.java index 34c560e4950..3610cd58294 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDTrueTypeFont.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDTrueTypeFont.java @@ -770,14 +770,7 @@ private TTFParser getParser(RandomAccessRead randomAccessRead, boolean isEmbedde throws IOException { long startPos = randomAccessRead.getPosition(); - byte[] tagBytes = new byte[4]; - int remainingBytes = tagBytes.length; - int amountRead; - while ((amountRead = randomAccessRead.read(tagBytes, tagBytes.length - remainingBytes, - remainingBytes)) > 0) - { - remainingBytes -= amountRead; - } + byte[] tagBytes = randomAccessRead.readNBytes(4); randomAccessRead.seek(startPos); 
if ("OTTO".equals(new String(tagBytes, StandardCharsets.US_ASCII))) {