diff --git a/.gitignore b/.gitignore index f32e31a..4a62f47 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .idea/ .DS_Store +pdf_test.go diff --git a/README.md b/README.md index 477408c..2b01429 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Features ## Install: -`go get -u github.com/ledongthuc/pdf` +`go get -u github.com/dslipak/pdf` ## Read plain text @@ -20,7 +20,7 @@ import ( "bytes" "fmt" - "github.com/ledongthuc/pdf" + "github.com/dslipak/pdf" ) func main() { @@ -92,7 +92,7 @@ import ( "fmt" "os" - "github.com/dcu/pdf" + "github.com/dslipak/pdf" ) func main() { diff --git a/lex.go b/lex.go index ee73fd9..652e18b 100644 --- a/lex.go +++ b/lex.go @@ -7,6 +7,7 @@ package pdf import ( + "errors" "fmt" "io" "strconv" @@ -78,11 +79,12 @@ func (b *buffer) readByte() byte { return c } -func (b *buffer) errorf(format string, args ...interface{}) { - panic(fmt.Errorf(format, args...)) +func (b *buffer) errorf(format string, args ...interface{}) string { + // panic(fmt.Errorf(format, args...)) + return fmt.Sprintf(format, args...) } -func (b *buffer) reload() bool { +func (b *buffer) reload() (bool, error) { n := cap(b.buf) - int(b.offset%int64(cap(b.buf))) n, err := b.r.Read(b.buf[:n]) if n == 0 && err != nil { @@ -90,24 +92,29 @@ func (b *buffer) reload() bool { b.pos = 0 if b.allowEOF && err == io.EOF { b.eof = true - return false + return false, err } - b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err) - return false + fmt.Sprint(b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err)) + return false, err } b.offset += int64(n) b.buf = b.buf[:n] b.pos = 0 - return true + return true, err } -func (b *buffer) seekForward(offset int64) { +func (b *buffer) seekForward(offset int64) (err error) { for b.offset < offset { - if !b.reload() { - return + rel, err := b.reload() + if err != nil { + return err + } + if !rel { + return err } } b.pos = len(b.buf) - int(b.offset-offset) + return err } func (b *buffer) readOffset() int64 { @@ -160,7 +167,7 @@ func (b *buffer) readToken() token { return b.readLiteralString() case '[', ']', '{', '}': - return keyword(string(c)) + return keyword(c) case '/': return b.readName() @@ -174,8 +181,9 @@ func (b *buffer) readToken() token { default: if isDelim(c) { - b.errorf("unexpected delimiter %#q", rune(c)) - return nil + // b.errorf("unexpected delimiter %#q", rune(c)) + return b.errorf("unexpected delimiter %#q", rune(c)) + // return nil } b.unreadByte() return b.readKeyword() @@ -200,7 +208,7 @@ func (b *buffer) readHexString() token { } x := unhex(c)<<4 | unhex(c2) if x < 0 { - b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]) + fmt.Sprint(b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:])) break } tmp = append(tmp, byte(x)) @@ -241,7 +249,8 @@ Loop: case '\\': switch c = b.readByte(); c { default: - b.errorf("invalid escape sequence \\%c", c) + // b.errorf("invalid escape sequence \\%c", c) + fmt.Sprint(b.errorf("invalid escape sequence \\%c", c)) tmp = append(tmp, '\\', c) case 'n': tmp = append(tmp, '\n') @@ -294,7 +303,8 @@ func (b *buffer) readName() token { if c == '#' { x := unhex(b.readByte())<<4 | unhex(b.readByte()) if x < 0 { - b.errorf("malformed name") + // b.errorf("malformed name") + fmt.Sprint(b.errorf("malformed name")) } tmp = append(tmp, byte(x)) continue @@ -325,13 +335,15 @@ func (b *buffer) readKeyword() token { case isInteger(s): x, err := strconv.ParseInt(s, 10, 64) if err != nil { - b.errorf("invalid integer %s", s) + // b.errorf("invalid integer %s", s) + fmt.Sprint(b.errorf("invalid integer %s", s)) } return x case isReal(s): x, err := strconv.ParseFloat(s, 64) if err != nil { - b.errorf("invalid real %s", s) + // b.errorf("invalid real %s", s) + fmt.Sprint(b.errorf("invalid real %s", s)) } return x } @@ -409,19 +421,20 @@ type objdef struct { obj object } -func (b *buffer) readObject() object { +func (b *buffer) readObject() (object, error) { tok := b.readToken() if kw, ok := tok.(keyword); ok { switch kw { case "null": - return nil + return nil, nil case "<<": - return b.readDict() + return b.readDict(), nil case "[": - return b.readArray() + return b.readArray(), nil } - b.errorf("unexpected keyword %q parsing object", kw) - return nil + // b.errorf("unexpected keyword %q parsing object", kw) + return nil, errors.New(b.errorf("unexpected keyword %q parsing object", kw)) + // return nil } if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 { @@ -429,7 +442,7 @@ func (b *buffer) readObject() object { } if !b.allowObjptr { - return tok + return tok, nil } if t1, ok := tok.(int64); ok && int64(uint32(t1)) == t1 { @@ -438,26 +451,30 @@ func (b *buffer) readObject() object { tok3 := b.readToken() switch tok3 { case keyword("R"): - return objptr{uint32(t1), uint16(t2)} + return objptr{uint32(t1), uint16(t2)}, nil case keyword("obj"): old := b.objptr b.objptr = objptr{uint32(t1), uint16(t2)} - obj := b.readObject() + obj, err := b.readObject() + if err != nil { + return nil, err + } if _, ok := obj.(stream); !ok { tok4 := b.readToken() if tok4 != keyword("endobj") { - b.errorf("missing endobj after indirect object definition") + // b.errorf("missing endobj after indirect object definition") + fmt.Sprint(b.errorf("missing endobj after indirect object definition")) b.unreadToken(tok4) } } b.objptr = old - return objdef{objptr{uint32(t1), uint16(t2)}, obj} + return objdef{objptr{uint32(t1), uint16(t2)}, obj}, err } b.unreadToken(tok3) } b.unreadToken(tok2) } - return tok + return tok, nil } func (b *buffer) readArray() object { @@ -468,7 +485,11 @@ func (b *buffer) readArray() object { break } b.unreadToken(tok) - x = append(x, b.readObject()) + res, err := b.readObject() + if err != nil { + return err + } + x = append(x, res) } return x } @@ -482,10 +503,15 @@ func (b *buffer) readDict() object { } n, ok := tok.(name) if !ok { - b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok) + // b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok) + fmt.Sprint(b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok)) continue } - x[n] = b.readObject() + res, err := b.readObject() + if err != nil { + return nil + } + x[n] = res } if !b.allowStream { @@ -506,7 +532,8 @@ func (b *buffer) readDict() object { case '\n': // ok default: - b.errorf("stream keyword not followed by newline") + // b.errorf("stream keyword not followed by newline") + return b.errorf("stream keyword not followed by newline") } return stream{x, b.objptr, b.readOffset()} diff --git a/pdf_test.go b/pdf_test.go new file mode 100644 index 0000000..345ee95 --- /dev/null +++ b/pdf_test.go @@ -0,0 +1,48 @@ +package pdf + +import ( + "bytes" + "fmt" + "testing" +) + +const testFile = "/Users/dslipak/Documents/dslipak-20190925.pdf" + +func TestReadPdf(t *testing.T) { + f, err := Open(testFile) + if err != nil { + t.Error("Doc should not be nil', got ", err) + } + + totalPage := f.NumPage() + var buf bytes.Buffer + + for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { + p := f.Page(pageIndex) + if p.V.IsNull() { + continue + } + + texts := p.Content().Text + var lastY = 0.0 + line := "" + + for _, text := range texts { + if lastY != text.Y { + if lastY > 0 { + buf.WriteString(line + "\n") + line = text.S + } else { + line += text.S + } + } else { + line += text.S + } + + lastY = text.Y + } + buf.WriteString(line) + } + fmt.Println(buf.String()) +} + diff --git a/pdfpasswd/main.go b/pdfpasswd/main.go deleted file mode 100644 index 57fa88f..0000000 --- a/pdfpasswd/main.go +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Pdfpasswd searches for the password for an encrypted PDF -// by trying all strings over a given alphabet up to a given length. -package main - -import ( - "flag" - "fmt" - "log" - "os" - - "github.com/ledongthuc/pdf" -) - -var ( - alphabet = flag.String("a", "0123456789", "alphabet") - maxLength = flag.Int("m", 4, "max length") -) - -func usage() { - fmt.Fprintf(os.Stderr, "usage: pdfpasswd [-a alphabet] [-m maxlength] file\n") - os.Exit(2) -} - -func main() { - log.SetFlags(0) - log.SetPrefix("pdfpasswd: ") - - flag.Usage = usage - flag.Parse() - if flag.NArg() != 1 { - usage() - } - - f, err := os.Open(flag.Arg(0)) - if err != nil { - log.Fatal(err) - } - - last := "" - alpha := *alphabet - ctr := make([]int, *maxLength) - pw := func() string { - inc(ctr, len(alpha)+1) - for !valid(ctr) { - inc(ctr, len(alpha)+1) - } - if done(ctr) { - return "" - } - buf := make([]byte, len(ctr)) - var i int - for i = 0; i < len(buf); i++ { - if ctr[i] == 0 { - break - } - buf[i] = alpha[ctr[i]-1] - } - last = string(buf[:i]) - println(last) - return last - } - st, err := f.Stat() - if err != nil { - log.Fatal(err) - } - _, err = pdf.NewReaderEncrypted(f, st.Size(), pw) - if err != nil { - if err == pdf.ErrInvalidPassword { - log.Fatal("password not found") - } - log.Fatal("reading pdf: %v", err) - } - fmt.Printf("password: %q\n", last) -} - -func inc(ctr []int, n int) { - for i := 0; i < len(ctr); i++ { - ctr[i]++ - if ctr[i] < n { - break - } - ctr[i] = 0 - } -} - -func done(ctr []int) bool { - for _, x := range ctr { - if x != 0 { - return false - } - } - return true -} - -func valid(ctr []int) bool { - i := len(ctr) - for i > 0 && ctr[i-1] == 0 { - i-- - } - for i--; i >= 0; i-- { - if ctr[i] == 0 { - return false - } - } - return true -} diff --git a/ps.go b/ps.go index 90c551e..c7ec20e 100644 --- a/ps.go +++ b/ps.go @@ -59,6 +59,7 @@ func Interpret(strm Value, do func(stk *Stack, op string)) { b.allowStream = false var stk Stack var dicts []dict + Reading: for { tok := b.readToken() @@ -118,7 +119,10 @@ Reading: } } b.unreadToken(tok) - obj := b.readObject() + obj, err := b.readObject() + if err != nil { + return + } stk.Push(Value{nil, objptr{}, obj}) } } diff --git a/read.go b/read.go index da5f76f..7905639 100644 --- a/read.go +++ b/read.go @@ -44,7 +44,7 @@ // the package. Equally important, traversal of other PDF data structures can be implemented // in other packages as needed. // -package pdf // import "rsc.io/pdf" +package pdf // BUG(rsc): The package is incomplete, although it has been used successfully on some // large real-world PDF files. @@ -190,6 +190,9 @@ func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, e // Trailer returns the file's Trailer value. func (r *Reader) Trailer() Value { + if r == nil { + return Value{} + } return Value{r, r.trailerptr, r.trailer} } @@ -206,7 +209,10 @@ func readXref(r *Reader, b *buffer) ([]xref, objptr, dict, error) { } func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) { - obj1 := b.readObject() + obj1, err := b.readObject() + if err != nil { + return nil, objptr{}, nil, err + } obj, ok := obj1.(objdef) if !ok { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj1)) @@ -225,7 +231,7 @@ func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) { } table := make([]xref, size) - table, err := readXrefStreamData(r, strm, table, size) + table, err = readXrefStreamData(r, strm, table, size) if err != nil { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) } @@ -236,7 +242,10 @@ func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff) } b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off) - obj1 := b.readObject() + obj1, err := b.readObject() + if err != nil { + return nil, objptr{}, nil, err + } obj, ok := obj1.(objdef) if !ok { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj1)) @@ -353,7 +362,12 @@ func readXrefTable(r *Reader, b *buffer) ([]xref, objptr, dict, error) { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) } - trailer, ok := b.readObject().(dict) + res, err := b.readObject() + if err != nil { + return nil, objptr{}, nil, err + } + + trailer, ok := res.(dict) if !ok { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref table not followed by trailer dictionary") } @@ -373,7 +387,11 @@ func readXrefTable(r *Reader, b *buffer) ([]xref, objptr, dict, error) { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) } - trailer, ok := b.readObject().(dict) + res, err := b.readObject() + if err != nil { + return nil, objptr{}, nil, err + } + trailer, ok := res.(dict) if !ok { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev table not followed by trailer dictionary") } @@ -716,7 +734,7 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { if xref.ptr != ptr || !xref.inStream && xref.offset == 0 { return Value{} } - var obj object + // var obj object if xref.inStream { strm := r.resolve(parent, xref.stream) Search: @@ -739,7 +757,10 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { off, _ := b.readToken().(int64) if uint32(id) == ptr.id { b.seekForward(first + off) - x = b.readObject() + _, err := b.readObject() + if err != nil { + return Value{} + } break Search } } @@ -753,7 +774,10 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { b := newBuffer(io.NewSectionReader(r.f, xref.offset, r.end-xref.offset), xref.offset) b.key = r.key b.useAES = r.useAES - obj = b.readObject() + obj, err := b.readObject() + if err != nil { + return Value{} + } def, ok := obj.(objdef) if !ok { panic(fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj)) @@ -773,7 +797,9 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { case string: return Value{r, parent, x} default: - panic(fmt.Errorf("unexpected value type %T in resolve", x)) + // panic(fmt.Errorf("unexpected value type %T in resolve", x)) + fmt.Sprintf("unexpected value type %T in resolve", x) + return Value{} } }