-
Notifications
You must be signed in to change notification settings - Fork 0
/
EXTRACT-PDF.2.EXCEL.py
66 lines (66 loc) · 3.66 KB
/
EXTRACT-PDF.2.EXCEL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#--
#-- ************************************************************************************************************:
#-- ********************************************* EXTRACT PDF TABLES *******************************************:
#-- ************************************************************************************************************:
#-- Author: JBALLARD (JEB) :
#-- Date: 2023.2.18 :
#-- Script: EXTRACT-PDF.2.EXCEL.py :
#-- Purpose: A python script that extracts PDF tables to EXCEL. :
#-- Requires: python -m pip install Spire.Pdf; python3 -m pip install Spire.Pdf :
#-- Requires: python -m pip install Spire.Xls; python3 -m pip install Spire.Xls :
#-- Version: 1.0 :
#-- ************************************************************************************************************:
#-- ************************************************************************************************************:
#--
#-- ********************************************************:
#-- DEFINE PARAMS, CONSTANTS, CONFIG PATHS, IMPORT CLASSES :
#-- ********************************************************:
from spire.pdf import *
from spire.xls import *
#--
#-- DEFINE FUNCTION TO EXTRACT PDF TABLE DATA TO EXCEL:
def _copy_table_to_worksheet(table, worksheet):
    """Copy every cell of one extracted PDF table into *worksheet*.

    Cell text is stripped of surrounding whitespace and written to the
    worksheet at 1-based (row, column) coordinates; columns are then
    auto-fitted.
    """
    row_count = table.GetRowCount()
    col_count = table.GetColumnCount()
    for row_index in range(row_count):
        for column_index in range(col_count):
            data = table.GetText(row_index, column_index)
            #-- GUARD IN CASE GetText RETURNS None, SO ONE EMPTY CELL CANNOT ABORT THE WHOLE EXPORT:
            text = data.strip() if data is not None else ""
            #-- WORKSHEET RANGE IS INDEXED FROM 1, TABLE CELLS FROM 0:
            worksheet.Range[row_index + 1, column_index + 1].Value = text
    #-- ADJUST EXCEL COLUMNS WIDTH:
    worksheet.Range.AutoFitColumns()


def extract_table_data_to_excel(pdf_path, xls_path):
    """Extract every table found in a PDF document into an Excel workbook.

    Each table returned by ``PdfTableExtractor.ExtractTable`` gets its own
    worksheet named ``"Table <t> of Page <p>"`` (both numbers 1-based). The
    workbook is saved to *xls_path* in the ``ExcelVersion.Version2016``
    format once all pages have been processed.

    Args:
        pdf_path: Path of the PDF document to read.
        xls_path: Path of the Excel workbook to write.

    Returns:
        None. Any exception raised while loading, extracting, saving or
        releasing resources is caught and reported on stdout as
        ``Error occurred: <message>`` instead of being propagated.
    """
    doc = PdfDocument()
    workbook = None
    try:
        try:
            #-- LOAD PDF PATH:
            doc.LoadFromFile(pdf_path)
            #-- BUILD TABLE EXTRACTOR AND AN EMPTY TARGET WORKBOOK:
            extractor = PdfTableExtractor(doc)
            workbook = Workbook()
            #-- DROP THE WORKBOOK'S INITIAL SHEETS SO ONLY EXTRACTED TABLES REMAIN:
            workbook.Worksheets.Clear()
            #--
            #-- ITERATE THROUGH PDF DOCUMENT:
            for page_index in range(doc.Pages.Count):
                #-- RETRIEVE PDF TABLES:
                tables = extractor.ExtractTable(page_index)
                if tables is None or len(tables) == 0:
                    continue
                #--
                #-- ITERATE THROUGH TABLES:
                for table_index, table in enumerate(tables):
                    #-- BUILD NEW EXCEL WORKSHEET FOR EACH EXTRACTED PDF TABLE:
                    worksheet = workbook.CreateEmptySheet()
                    worksheet.Name = f"Table {table_index + 1} of Page {page_index + 1}"
                    _copy_table_to_worksheet(table, worksheet)
            #-- SAVE EXCEL WORKBOOK TO PATH:
            #-- NOTE(review): IF NO TABLE WAS FOUND THE WORKBOOK HAS NO SHEETS HERE - CONFIRM SaveToFile HANDLES THAT.
            workbook.SaveToFile(xls_path, ExcelVersion.Version2016)
        finally:
            #-- ALWAYS RELEASE THE WORKBOOK AND PDF DOCUMENT, EVEN ON FAILURE:
            if workbook is not None:
                workbook.Dispose()
            doc.Close()
    #--
    except Exception as e:
        #-- BEST-EFFORT BY DESIGN: REPORT THE FAILURE INSTEAD OF RAISING TO THE CALLER.
        print(f"Error occurred: {str(e)}")
#--
#-- USE EXAMPLE (THE EXTRACTION RUNS ONLY WHEN THIS FILE IS EXECUTED AS A SCRIPT, NOT ON IMPORT):
pdf_path = "Tables.pdf"
xls_path = "table_data.xlsx"
if __name__ == "__main__":
    extract_table_data_to_excel(pdf_path, xls_path)
#--
#-- ********************************************************:
#-- END OF PYTHON SCRIPT :
#-- ********************************************************: