Skip to content

dslipak/pdf

This branch is 13 commits ahead of, 7 commits behind ledongthuc/pdf:master.

Folders and files

NameName
Last commit message
Last commit date

Latest commit

636e0c0 · Jan 24, 2024

History

71 Commits
Jan 16, 2024
Oct 6, 2020
Feb 25, 2014
Oct 18, 2020
Feb 14, 2019
Oct 6, 2020
Feb 25, 2014
Jan 16, 2024
Jan 16, 2024
Oct 6, 2020
Jan 16, 2024
Feb 25, 2014

Repository files navigation

PDF Reader

A simple Go library which enables reading PDF files.

This repository contains merged code from https://github.com/rsc/pdf and https://github.com/ledongthuc/pdf

Features

  • Get plain text content (without formatting)
  • Get Content (including all font and formatting information)

Install:

go get -u github.com/dslipak/pdf

Read plain text

package main

import (
	"bytes"
	"fmt"

	"github.com/dslipak/pdf"
)

func main() {
	pdf.DebugOn = true
	content, err := readPdf("test.pdf") // Read local pdf file
	if err != nil {
		panic(err)
	}
	fmt.Println(content)
	return
}

func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	// remember close file
    defer f.Close()
	if err != nil {
		return "", err
	}
	var buf bytes.Buffer
    b, err := r.GetPlainText()
    if err != nil {
        return "", err
    }
    buf.ReadFrom(b)
	return buf.String(), nil
}

Read all text with styles from PDF

func readPdf2(path string) (string, error) {
	f, r, err := pdf.Open(path)
	// remember close file
	defer f.Close()
	if err != nil {
		return "", err
	}
	totalPage := r.NumPage()

	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}
		var lastTextStyle pdf.Text
		texts := p.Content().Text
		for _, text := range texts {
			if isSameSentence(text, lastTextStyle) {
				lastTextStyle.S = lastTextStyle.S + text.S
			} else {
				fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
				lastTextStyle = text
			}
		}
	}
	return "", nil
}

Read text grouped by rows

package main

import (
	"fmt"
	"os"

	"github.com/dslipak/pdf"
)

func main() {
	content, err := readPdf(os.Args[1]) // Read local pdf file
	if err != nil {
		panic(err)
	}
	fmt.Println(content)
	return
}

func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	defer func() {
		_ = f.Close()
	}()
	if err != nil {
		return "", err
	}
	totalPage := r.NumPage()

	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		rows, _ := p.GetTextByRow()
		for _, row := range rows {
		    println(">>>> row: ", row.Position)
		    for _, word := range row.Content {
		        fmt.Println(word.S)
		    }
		}
	}
	return "", nil
}