Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

made fixes for really broken but readable files. #32

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 44 additions & 11 deletions src/PdfSharp/Pdf.IO/Lexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,35 @@ public int Position
}
}

/// <summary>
/// Reads the next token and returns its type. If the token starts with a digit, the parameter
/// testReference specifies how to treat it. If it is false, the lexer scans for a single integer.
/// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference,
/// the token is set to the object ID followed by the generation number separated by a blank
/// (the 'R' is omitted from the token).
/// </summary>
// /// <param name="testReference">Indicates whether to test the next token if it is a reference.</param>
public Symbol ScanNextToken()
/// <summary>
/// Reads the next token and returns its type. If the token starts with a digit, the parameter
/// testReference specifies how to treat it. If it is false, the lexer scans for a single integer.
/// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference,
/// the token is set to the object ID followed by the generation number separated by a blank
/// (the 'R' is omitted from the token).
/// </summary>
// /// <param name="testReference">Indicates whether to test the next token if it is a reference.</param>
public Symbol ScanNextToken()
{
    // This overload is for callers that only need the symbol type: it forwards to the
    // position-reporting overload and deliberately discards the reported position.
    Symbol symbol = ScanNextToken(out int _);
    return symbol;
}

/// <summary>
/// Reads the next token and returns its type. If the token starts with a digit, the parameter
/// testReference specifies how to treat it. If it is false, the lexer scans for a single integer.
/// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference,
/// the token is set to the object ID followed by the generation number separated by a blank
/// (the 'R' is omitted from the token).
/// </summary>
/// <param name="position">Receives the start position of the token that was scanned.</param>
public Symbol ScanNextToken(out int position)
{
Again:
_token = new StringBuilder();

char ch = MoveToNonWhiteSpace();
switch (ch)
position = Position;
switch (ch)
{
case '%':
// Eat comments, the parser doesn't handle them
Expand Down Expand Up @@ -190,7 +204,26 @@ public byte[] ReadStream(int length)
else
pos = _idxChar + 1;

_pdfSteam.Position = pos;
// Verify stream length and resolve if bad
string post_stream = ReadRawString(pos + length, ("endstream").Length);
if (post_stream != "endstream")
{
// find the first endstream occurrence
// first check to see if it is within the specified stream length.
int endstream_idx = post_stream.IndexOf("endstream", StringComparison.Ordinal);
if (endstream_idx == -1)
{
post_stream = ReadRawString(pos, _pdfLength - pos);
endstream_idx = post_stream.IndexOf("endstream", StringComparison.Ordinal);
}

if (endstream_idx != -1)
{
length = endstream_idx;
}
}

_pdfSteam.Position = pos;
byte[] bytes = new byte[length];
int read = _pdfSteam.Read(bytes, 0, length);
Debug.Assert(read == length);
Expand Down
201 changes: 195 additions & 6 deletions src/PdfSharp/Pdf.IO/Parser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,12 @@ private Symbol ScanNextToken()
return _lexer.ScanNextToken();
}

private Symbol ScanNextToken(out string token)
/// <summary>
/// Scans the next token through the lexer and reports where it was found.
/// </summary>
/// <param name="position">Receives the position value reported by the lexer for the scanned token.</param>
/// <returns>The symbol type returned by the lexer.</returns>
private Symbol ScanNextToken(out int position)
    => _lexer.ScanNextToken(out position);

private Symbol ScanNextToken(out string token)
{
Symbol symbol = _lexer.ScanNextToken();
token = _lexer.Token;
Expand Down Expand Up @@ -1031,10 +1036,23 @@ internal PdfTrailer ReadTrailer()
throw new Exception("The StartXRef table could not be found, the file cannot be opened.");

ReadSymbol(Symbol.StartXRef);
_lexer.Position = ReadInteger();
int startxref = _lexer.Position = ReadInteger();

// Read all trailers.
while (true)
// Check for valid startxref
if (!IsValidXref())
{
PdfTrailer trailer = TryRecreateXRefTableAndTrailer(_document._irefTable);
if (trailer == null)
throw new Exception("Could not recreate the xref table or trailer.");

_document._trailer = trailer;
return _document._trailer;
}

_lexer.Position = startxref;

// Read all trailers.
while (true)
{
PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable);
// 1st trailer seems to be the best.
Expand All @@ -1052,9 +1070,180 @@ internal PdfTrailer ReadTrailer()
}

/// <summary>
/// Reads cross reference table(s) and trailer(s).
/// Checks that the current _lexer location is a valid xref.
/// </summary>
private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
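/// <returns><c>true</c> if the scan finds no sign of being inside a stream and the token at the current lexer position is the xref keyword or the start of an indirect object (integer, integer, obj); otherwise <c>false</c>.</returns>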
private bool IsValidXref()
{
    // Decides whether the lexer's current position (the offset read after "startxref")
    // plausibly points at a cross-reference section. Two checks are made:
    //   1. The offset must not appear to lie inside stream data.
    //   2. The first token(s) at the offset must be "xref" or "<int> <int> obj".
    int length = _lexer.PdfLength;
    int position = _lexer.Position;

    // An offset outside the file can never be valid. Rejecting it here also avoids
    // asking the lexer to read a zero or negative number of bytes below.
    if (position < 0 || position >= length)
        return false;

    // Make sure not inside a stream.
    string content = "";
    int content_pos = position;
    while (content_pos < length)
    {
        // Look for stream and endstream in 1k chunks. The text read so far is kept, so a
        // keyword that straddles a chunk boundary is still found on the next pass.
        int read_length = Math.Min(1024, length - content_pos);
        content += _lexer.ReadRawString(content_pos, read_length);

        int ss = content.IndexOf("stream", StringComparison.Ordinal);
        int es = content.IndexOf("endstream", StringComparison.Ordinal);
        int eof = content.IndexOf("%%EOF", StringComparison.Ordinal);

        if (ss != es)
        {
            if (ss == -1)
            {
                // Kept for parity with the original logic; "endstream" contains "stream",
                // so ss cannot be -1 when es is not.
                if (eof != -1 && eof < es)
                    break;
                else
                    return false;
            }
            else if (es == -1 || ss < es)
            {
                // A "stream" keyword shows up before any "endstream": a new stream starts
                // after the offset, so the offset itself is not inside one.
                break;
            }
            else
            {
                // "endstream" comes first (ss > es): the offset lies inside stream data
                // unless the end-of-file marker precedes both keywords.
                if (eof != -1 && eof < ss && eof < es)
                    break;
                else
                    return false;
            }
        }

        if (eof != -1)
            break;

        // Advance to the next chunk. The loop condition ends the scan once the whole
        // remainder of the file, including a final partial chunk, has been examined.
        content_pos += read_length;
    }

    // Scan tokens starting at the candidate offset again.
    _lexer.Position = position;

    Symbol symbol = ScanNextToken();
    if (symbol == Symbol.XRef)
    {
        return true;
    }

    if (symbol == Symbol.Integer)
    {
        // Just because we have an integer, doesn't mean the startxref is actually valid:
        // it must be followed by a second integer and the "obj" keyword.
        return ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj;
    }

    return false;
}

/// <summary>
/// Attempts to recover from an unusable startxref: reads the last "trailer" dictionary found
/// in the tail of the file, then scans the file from the beginning and registers every
/// "id generation obj" header it finds in the given cross-reference table.
/// </summary>
/// <param name="xrefTable">The table that receives a reference for each object header not already contained in it.</param>
/// <returns>The trailer that was read, or <c>null</c> if no "trailer" keyword was found in the tail of the file.</returns>
/// <exception cref="Exception">A stream keyword was scanned but no matching "endstream" follows it.</exception>
private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
{
    // Let's first check for a trailer. Only the tail of the file is searched: the last
    // 1024 bytes, or the whole file when it is shorter than that.
    int length = _lexer.PdfLength;
    int tail_start = Math.Max(0, length - 1024);
    string tail = _lexer.ReadRawString(tail_start, length - tail_start);
    int trail_idx = tail.LastIndexOf("trailer", StringComparison.Ordinal);

    // Check for "not found" BEFORE touching the lexer position; otherwise a missing
    // trailer would move the lexer to a bogus (possibly negative) offset.
    if (trail_idx == -1)
        return null; //TODO: Look for compressed xref table that should contain the trailer

    _lexer.Position = tail_start + trail_idx;

    ReadSymbol(Symbol.Trailer);
    ReadSymbol(Symbol.BeginDictionary);
    PdfTrailer trailer = new PdfTrailer(_document);
    ReadDictionary(trailer, false);

    // Recreate the xref table.
    //
    // token_stack holds the two most recently scanned tokens.
    // When symbol == Symbol.Obj
    // [0] - generation
    // [1] - id
    TokenInfo[] token_stack = new TokenInfo[2];
    _lexer.Position = 0;
    while (true)
    {
        Symbol symbol = ScanNextToken(out int position);
        if (symbol == Symbol.Eof)
            break;

        // We need to skip over streams entirely.
        if (symbol == Symbol.BeginStream)
        {
            // We're not reading any data from the object so we need to find endstream.
            int pos = _lexer.Position;
            string trail = "";
            int trail_pos = pos;
            while (true)
            {
                // Give up only after the whole remainder of the file, including the
                // final partial chunk, has been searched.
                if (trail_pos >= length)
                {
                    // No endstream was found.
                    throw new Exception("endstream not found.");
                }

                // Look for endstream in 1k chunks. Text read so far is kept so that a
                // keyword split across two chunks is still matched.
                int trail_length = Math.Min(1024, length - trail_pos);
                trail += _lexer.ReadRawString(trail_pos, trail_length);
                int stop = trail.IndexOf("endstream", StringComparison.Ordinal);
                if (stop != -1)
                {
                    // Resume scanning at the endstream keyword itself.
                    _lexer.Position = stop + pos;
                    break;
                }

                trail_pos += trail_length;
            }
        }

        if (symbol == Symbol.Obj &&
            token_stack[0].Symbol == Symbol.Integer &&
            token_stack[1].Symbol == Symbol.Integer)
        {
            // The object's position is where its id token (the older of the two integers) was scanned.
            PdfObjectID objectID = new PdfObjectID(token_stack[1].Number, token_stack[0].Number);
            if (!xrefTable.Contains(objectID))
                xrefTable.Add(new PdfReference(objectID, token_stack[1].Position));
            //ReadObject(null, objectID, false, false); // Can't do this because the object value will never be set after
            //SkipCharsUntil(Symbol.EndObj); // Can't do this because streams will cause exceptions
        }

        // Shift the history: the previous token becomes [1], the current one becomes [0].
        token_stack[1] = token_stack[0];
        TokenInfo token_info = new TokenInfo { Symbol = symbol, Position = position };
        if (symbol == Symbol.Integer)
            token_info.Number = _lexer.TokenToInteger;
        token_stack[0] = token_info;
    }

    return trailer;
}

/// <summary>
/// Plain record of one scanned token: its symbol type, the position reported for it,
/// and its integer value when one was captured.
/// </summary>
struct TokenInfo
{
    /// <summary>The symbol type of the token.</summary>
    public Symbol Symbol;

    /// <summary>The integer value of the token; stays at its default of 0 unless explicitly assigned.</summary>
    public int Number;

    /// <summary>The position reported by the scanner for the token.</summary>
    public int Position;
}

/// <summary>
/// Reads cross reference table(s) and trailer(s).
/// </summary>
private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
{
Debug.Assert(xrefTable != null);

Expand Down