Skip to content

Commit

Permalink
Proper fix for invalid startxref.
Browse files Browse the repository at this point in the history
  • Loading branch information
mlaukala committed Jun 6, 2018
1 parent afee7e1 commit 236c74c
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 135 deletions.
33 changes: 32 additions & 1 deletion src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public PdfTrailer(PdfCrossReferenceStream trailer)
if (id != null)
Elements.SetValue(Keys.ID, id);
}

public int Size
{
get { return Elements.GetInteger(Keys.Size); }
Expand Down Expand Up @@ -218,6 +218,37 @@ internal void Finish()
_document._irefTable.IsUnderConstruction = false;
}

/// <summary>
/// Constructs the PdfTrailer from a document by filling in the /Size and /Root entries
/// from the document's cross-reference table.
/// </summary>
/// <remarks>
/// /Size is set to one greater than the highest object number found in the
/// cross-reference table, as required by ISO 32000-1, 7.5.5. Merely counting the
/// entries yields a value that is too small when object numbers are not contiguous
/// and never accounts for the free entry of object 0.
/// /Root is set to the dictionary whose /Type is /Catalog; if several exist, the one
/// with the highest generation number wins (the first one found wins on a tie).
/// If no catalog is found, /Root is left unset.
/// </remarks>
/// <param name="parser">the parser used to read the file.</param>
internal void ConstructFromDocument(Parser parser)
{
    // TODO - May need to also search for encryption related trailer info
    PdfCrossReferenceTable xrefTable = _document._irefTable;

    int highestObjectNumber = 0;
    PdfDictionary rootToUse = null;
    foreach (var reference in xrefTable.AllReferences)
    {
        // Track the highest object number so /Size can be derived from it.
        if (reference.ObjectID.ObjectNumber > highestObjectNumber)
            highestObjectNumber = reference.ObjectID.ObjectNumber;

        // find the root.
        PdfObject obj = parser.ReadObject(null, reference.ObjectID, false, false);
        if (obj is PdfDictionary dObj &&
            dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog")
        {
            // Keep the first catalog unless a later one has a strictly higher generation.
            if (rootToUse == null ||
                dObj.ObjectID.GenerationNumber > rootToUse.ObjectID.GenerationNumber)
            {
                rootToUse = dObj;
            }
        }
    }

    // One greater than the highest object number (object 0 is the head of the free list).
    Elements.SetInteger(Keys.Size, highestObjectNumber + 1);

    if (rootToUse != null)
        Elements.SetReference(Keys.Root, rootToUse);
}

/// <summary>
/// Predefined keys of this dictionary.
/// </summary>
Expand Down
5 changes: 4 additions & 1 deletion src/PdfSharp/Pdf.IO/Lexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,10 @@ public bool TryScanNextToken(out Symbol symbol, out int position)
case '%':
// Eat comments, the parser doesn't handle them
//return symbol = ScanComment();
ScanComment();
symbol = _symbol = ScanComment();
// Do not eat EOF
if (symbol == Symbol.Eof)
return true;
goto Again;

case '/':
Expand Down
234 changes: 107 additions & 127 deletions src/PdfSharp/Pdf.IO/Parser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1110,62 +1110,31 @@ internal PdfTrailer ReadTrailer()
if (idx == -1)
throw new Exception("The StartXRef table could not be found, the file cannot be opened.");

ReadSymbol(Symbol.StartXRef);
int startxref = _lexer.Position = ReadInteger();

// Must be before the first 'goto valid_xref;' statement.
int xref_offset = 0;

// Check for valid startxref
if (IsValidXref())
{
goto valid_xref;
}

// If we reach this point, we have an invalid startxref
// First look for bytes preceding "%PDF-". Some pdf producers ignore these.
if (length >= 1024)
{
// "%PDF-" should be in this range
string header = _lexer.ReadRawString(0, 1024);
idx = header.IndexOf("%PDF-", StringComparison.Ordinal);
}
else
{
string header = _lexer.ReadRawString(0, length);
idx = header.IndexOf("%PDF-", StringComparison.Ordinal);
}

if (idx > 0)
{
//_lexer.ByteOffset = idx;
_lexer.Position = startxref + idx;
if (IsValidXref())
{
xref_offset = idx;
goto valid_xref;
}
}

_lexer.Position = startxref;
Symbol s = ReadSymbol(Symbol.StartXRef);
_lexer.Position = ReadInteger();

// Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154)
// Problem: certificate data added to the start of file. Invalid startxref byte offset
// Fix: We could search for a valid xref table but all byte offsets are probably incorrect.
// Probably best to just recreate the xref table.
// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.5.5

// Check for valid startxref
if (!IsValidXref())
{
PdfTrailer trailer = TryRecreateXRefTableAndTrailer(_document._irefTable);
if (trailer == null)
PdfTrailer trailer;
bool bSuccess = TryRecreateXRefTableAndTrailer(out trailer, _document);
if (!bSuccess)
throw new Exception("Could not recreate the xref table or trailer.");

_document._trailer = trailer;
return _document._trailer;
}

valid_xref:
_lexer.Position = startxref + xref_offset;


// Read all trailers.
while (true)
{
PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable, xref_offset);
PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable);
// 1st trailer seems to be the best.
if (_document._trailer == null)
_document._trailer = trailer;
Expand All @@ -1186,114 +1155,101 @@ internal PdfTrailer ReadTrailer()
/// <returns></returns>
private bool IsValidXref()
{
int length = _lexer.PdfLength;
int position = _lexer.Position;
// Make sure not inside a stream.

string content = "";
int content_pos = position;
while (true)
try
{
// look for stream and endstream in 1k chunks.
int read_length = Math.Min(1024, length - content_pos);
content += _lexer.ReadRawString(content_pos, read_length);

int ss = content.IndexOf("stream", StringComparison.Ordinal);
int es = content.IndexOf("endstream", StringComparison.Ordinal);
int eof = content.IndexOf("%%EOF", StringComparison.Ordinal);
Symbol symbol = ScanNextToken();
if (symbol == Symbol.XRef) // xref table
{
_lexer.Position = position;
return true;
}

if (ss != es)
if (symbol == Symbol.Integer) // Linearization parameter dictionary
{
if (ss == -1)
{
if (eof != -1 && eof < es)
break;
else
return false;
}
else if (es == -1)
break;
else if (ss < es)
break;
else if (ss > es)
// Just because we have an integer, doesn't mean the startxref is actually valid
if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj)
{
if (eof != -1 && eof < ss && eof < es)
break;
else
return false;
_lexer.Position = position;
return true;
}
}

if (eof != -1)
break;

content_pos = content_pos + read_length;
if (content_pos + read_length >= length)
{
// reached the end of the document without finding either.
break;
}
_lexer.Position = position;
return false;
}
catch
{
_lexer.Position = position;
return false;
}
}

_lexer.Position = position;
private bool TryRecreateXRefTableAndTrailer(out PdfTrailer trailer, PdfDocument document)
{
PdfCrossReferenceTable xrefTable = document._irefTable;
trailer = null;
int length = _lexer.PdfLength;

Symbol symbol = ScanNextToken();
if (symbol == Symbol.XRef)
// because some pdf producers put random info before the header, we need to find a proper starting position.
// i.e. Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154)
int startIdx = -1;
string contents = "";
for (int i = 0, pos = 0; startIdx == -1 && pos < length; i++, pos = 1024 * i)
{
return true;
int len = Math.Min(1024, length - pos);
contents = $"{contents}{_lexer.ReadRawString(pos, len)}";
startIdx = contents.IndexOf("%PDF-1.", StringComparison.Ordinal);
}

if (symbol == Symbol.Integer)
if (startIdx == -1)
return false;

// Don't look past the last %%EOF marker
int endIdx = -1;
contents = "";
for (int i = 1; endIdx == -1; i++)
{
// Just because we have an integer, doesn't mean the startxref is actually valid
if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj)
int pos = length - (1024 * i);
int len = 1024;

if (pos < 0)
{
return true;
len = len + pos;
pos = 0;
}
}

return false;
}

private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
{
// Let's first check for a trailer
int length = _lexer.PdfLength;

int trail_idx;
if (length >= 1024)
{
string trail = _lexer.ReadRawString(length - 1024, 1024);
trail_idx = trail.LastIndexOf("trailer", StringComparison.Ordinal);
_lexer.Position = length - 1024 + trail_idx;
}
else
{
string trail = _lexer.ReadRawString(0, length);
trail_idx = trail.LastIndexOf("trailer", StringComparison.Ordinal);
_lexer.Position = trail_idx;
}
contents = $"{_lexer.ReadRawString(pos, len)}{contents}";
endIdx = contents.LastIndexOf("%%EOF", StringComparison.Ordinal);
if (endIdx != -1)
endIdx = length - contents.Length + endIdx;

if (trail_idx == -1)
return null; //TODO: Look for compressed xref table that should contain the trailer
if (pos == 0)
break;
}

ReadSymbol(Symbol.Trailer);
ReadSymbol(Symbol.BeginDictionary);
PdfTrailer trailer = new PdfTrailer(_document);
ReadDictionary(trailer, false);
if (endIdx == -1)
return false;

endIdx = endIdx + 5; // This should be where Eof char is

// Recreate the xref table.
//
// When symbol == Symbol.Obj
// [0] - generation
// [1] - id
TokenInfo[] token_stack = new TokenInfo[2];
_lexer.Position = 0;

_lexer.Position = startIdx;
while (true)
{
Symbol symbol = ScanNextToken(out int position);
if (symbol == Symbol.Eof)
break;
if (symbol == Symbol.Eof)
{
// Check if it's the last EOF
if (_lexer.Position >= endIdx)
break; // This is the end of the file.
}

// we need to skip over streams entirely
if (symbol == Symbol.BeginStream)
Expand Down Expand Up @@ -1327,21 +1283,45 @@ private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTab
token_stack[0].Symbol == Symbol.Integer &&
token_stack[1].Symbol == Symbol.Integer)
{
// TODO:: Do we only need the most recent revision?
PdfObjectID objectID = new PdfObjectID(token_stack[1].Number, token_stack[0].Number);
if (!xrefTable.Contains(objectID))
xrefTable.Add(new PdfReference(objectID, token_stack[1].Position));
//ReadObject(null, objectID, false, false); // Can't do this because the object value will never be set after
//SkipCharsUntil(Symbol.EndObj); // Can't do this because streams will cause exceptions
}

token_stack[1] = token_stack[0];
token_stack[1] = token_stack[0];
TokenInfo token_info = new TokenInfo { Symbol = symbol, Position = position };
if (symbol == Symbol.Integer)
token_info.Number = _lexer.TokenToInteger;
token_stack[0] = token_info;
}

return trailer;
// find the root.
// foreach (var reference in xrefTable.AllReferences)
// {
// PdfObject obj = ReadObject(null, reference.ObjectID, false, false);
// if (obj is PdfDictionary dObj)
// {
// if (dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog")
// {
// PdfCatalog catalog = new PdfCatalog(dObj);
// }
// }
// }








trailer = new PdfTrailer(_document);
trailer.ConstructFromDocument(this);

return true;
}

struct TokenInfo
Expand All @@ -1354,7 +1334,7 @@ struct TokenInfo
/// <summary>
/// Reads cross reference table(s) and trailer(s).
/// </summary>
private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int xrefOffset)
private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
{
Debug.Assert(xrefTable != null);

Expand All @@ -1372,7 +1352,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int
int length = ReadInteger();
for (int id = start; id < start + length; id++)
{
int position = ReadInteger() + xrefOffset;
int position = ReadInteger();
int generation = ReadInteger();
ReadSymbol(Symbol.Keyword);
string token = _lexer.Token;
Expand Down
Loading

0 comments on commit 236c74c

Please sign in to comment.