-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpdf_extract.py
64 lines (57 loc) · 2.35 KB
/
pdf_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import PyPDF2
import argparse
import os
def _resolve(obj):
    """Return the object *obj* refers to, or *obj* itself if it is direct."""
    # Newer PyPDF2 releases spell the accessor get_object(); older ones only
    # offer getObject(). Try the modern name first so no deprecated call is made.
    for attr in ("get_object", "getObject"):
        accessor = getattr(obj, attr, None)
        if callable(accessor):
            return accessor()
    return obj


def _stream_data(stream):
    """Return the decoded bytes of an embedded-file stream object."""
    for attr in ("get_data", "getData"):
        accessor = getattr(stream, attr, None)
        if callable(accessor):
            return accessor()
    raise TypeError(f"Object of type {type(stream).__name__} is not a stream")


def _iter_name_tree(node, depth=0):
    """Yield (key, value) pairs of a name tree node, descending into /Kids."""
    # Malformed files can contain cyclic /Kids references; stop descending
    # rather than recursing forever.
    if depth > 32:
        return
    node = _resolve(node)
    if "/Names" in node:
        entries = _resolve(node["/Names"])
        # The array alternates key, value, key, value, ...; a dangling
        # trailing key without a value is dropped by zip().
        yield from zip(entries[0::2], entries[1::2])
    if "/Kids" in node:
        for kid in _resolve(node["/Kids"]):
            yield from _iter_name_tree(kid, depth + 1)


def get_attachments(reader):
    """
    Retrieves the file attachments of the PDF as a dictionary of file names
    and the file data as a bytestring.

    The whole /EmbeddedFiles name tree is walked, including entries stored
    under intermediate /Kids nodes. Entries whose key is not a string, or whose
    file specification carries no embedded stream, are skipped. If a name
    occurs more than once, the first entry is kept.

    :param reader: PyPDF2 reader object (PdfReader or PdfFileReader)
    :return: dictionary of filenames and bytestrings; empty if the document
        has no embedded files
    """
    catalog = _resolve(reader.trailer["/Root"])
    if "/Names" not in catalog:
        return {}
    names = _resolve(catalog["/Names"])
    if "/EmbeddedFiles" not in names:
        return {}

    attachments = {}
    for name, spec_ref in _iter_name_tree(names["/EmbeddedFiles"]):
        if not isinstance(name, str) or name in attachments:
            continue
        try:
            spec = _resolve(spec_ref)
            embedded = _resolve(spec["/EF"])
            # /F is the usual key; /UF is its Unicode counterpart.
            stream_ref = embedded["/F"] if "/F" in embedded else embedded["/UF"]
        except (KeyError, TypeError):
            # File specification without an embedded stream (for example a
            # reference to an external file): nothing to extract.
            continue
        attachments[name] = _stream_data(_resolve(stream_ref))
    return attachments
def _safe_file_name(file_name):
    """Reduce an attachment name to a bare file name with no directory part."""
    # Attachment names come from the PDF and are untrusted: treat both slash
    # styles as separators and keep only the final component so the result
    # cannot point outside the output directory.
    base = os.path.basename(str(file_name).replace("\\", "/")).replace("\x00", "")
    if base in ("", ".", ".."):
        return "attachment"
    return base


def save_attachments(attachments, output_dir):
    """
    Saves the attachments to the specified directory.

    The directory is created if it does not exist. Each attachment name is
    stripped of any directory components before writing, so every file lands
    directly inside ``output_dir``; names that become identical after
    stripping overwrite one another.

    :param attachments: dictionary of file data
    :param output_dir: directory to save files
    """
    os.makedirs(output_dir, exist_ok=True)
    for file_name, file_data in attachments.items():
        output_path = os.path.join(output_dir, _safe_file_name(file_name))
        with open(output_path, 'wb') as outfile:
            outfile.write(file_data)
        print(f"File saved: {output_path}")
def main():
    """
    Command-line entry point: extract the attachments of a PDF file.

    Parses the command line (PDF path, ``-o/--output`` directory,
    ``-v/--verbose``), reads the attachments and writes them to the output
    directory.

    :return: process exit status: 0 on success (including when the PDF has
        no attachments), 1 if an error was reported
    """
    parser = argparse.ArgumentParser(description="Extract attachments from a PDF file.")
    parser.add_argument("pdf_file", help="Path to the PDF file")
    parser.add_argument("-o", "--output", default=".", help="Output directory for extracted files")
    parser.add_argument("-v", "--verbose", action="store_true", help="Increase output verbosity")
    args = parser.parse_args()

    if args.verbose:
        print(f"Opening PDF file: {args.pdf_file}")

    try:
        with open(args.pdf_file, 'rb') as handler:
            # PyPDF2 3.x only provides PdfReader; fall back to PdfFileReader
            # on releases that predate it. strict=True keeps the parsing
            # behaviour of the PdfFileReader default.
            reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
            reader = reader_cls(handler, strict=True)
            if args.verbose:
                print("Extracting attachments...")
            # Stream data is read lazily, so extract while the file is open.
            attachments = get_attachments(reader)
        if attachments:
            save_attachments(attachments, args.output)
        else:
            print("No attachments found in the PDF.")
    except Exception as e:
        # Top-level boundary: report the problem and signal failure through
        # the exit status instead of exiting 0.
        print(f"Error: {e}")
        return 1
    return 0


if __name__ == "__main__":
    # Propagate main()'s status to the shell.
    raise SystemExit(main())