forked from dod-cyber-crime-center/DC3-MWCP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDF.py
61 lines (52 loc) · 1.78 KB
/
PDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
PDF
"""
import re
from mwcp import Parser, metadata
class Document(Parser):
    """
    Parses PDF file with some basic metadata extraction.

    The raw file bytes are scanned with regular expressions and every URL and
    email address found is added to the report.
    """

    DESCRIPTION = "PDF Document"
    AUTHOR = "DC3"

    # URLs containing any of these byte strings are not reported.
    # NOTE(review): presumably these are schema/namespace hosts that show up in
    # PDF metadata rather than in document content -- confirm.
    IGNORE_DOMAINS = [
        b"www.w3.org",
        b"ns.adobe.com",
        b"purl.org",
    ]

    # URL layout matched, in order:
    #   2-6 letter scheme -> "://" -> optional "user[:password]@" ->
    #   4-253 word/"."/"_"/"-" characters for the host (domain or IP address) ->
    #   optional ":port" (1-5 digits) -> zero or more "/segment" path parts ->
    #   optional "?query" -> optional "#fragment".
    # Raw bytes literals are used so that regex escapes such as ``\w`` and
    # ``\d`` are not treated as (invalid) bytes-literal escape sequences.
    URL_RE = re.compile(
        rb"[a-zA-Z]{2,6}"  # scheme
        rb"://"
        rb"([\w._\-]+(:[\w._\-]+)?@)?"  # user info
        rb"[\w._\-]{4,253}"  # host
        rb"(:[\d]{1,5})?"  # port
        rb"(/[\w._\-~=%]*)*"  # path
        rb"(\?[\w._\-~=&,%]+)?"  # query
        rb"(#[\w._\-~]+)?"  # fragment
    )

    # local part -> "@" -> one or more dot-terminated labels -> 2-6 letter TLD.
    EMAIL_RE = re.compile(
        rb"[\w.+-]+@([A-Za-z0-9](|[\w-]{0,61}[A-Za-z0-9])\.)+[A-Za-z]{2,6}"
    )

    @classmethod
    def identify(cls, file_object) -> bool:
        """
        Determine whether this parser applies to the given file.

        :param file_object: Object whose ``data`` attribute holds the raw file bytes.
        :return: True if the data starts with the ``%PDF`` magic and contains at
            least one URL or email address, otherwise False.
        """
        data = file_object.data
        if not data.startswith(b"%PDF"):
            return False
        # Only claim the file when there is something to extract.
        return bool(cls.URL_RE.search(data) or cls.EMAIL_RE.search(data))

    def extract_urls(self) -> None:
        """
        Statically extract URLs embedded in the PDF.

        Any URL containing one of the IGNORE_DOMAINS entries is skipped; all
        others are added to the report.
        """
        for match in self.URL_RE.finditer(self.file_object.data):
            url = match.group()
            # Substring test against the whole URL, not just its host part.
            if not any(domain in url for domain in self.IGNORE_DOMAINS):
                self.report.add(metadata.URL(url))

    def extract_emails(self) -> None:
        """
        Statically extract email addresses embedded in the PDF.
        """
        for match in self.EMAIL_RE.finditer(self.file_object.data):
            self.report.add(metadata.EmailAddress(match.group()))

    def run(self) -> None:
        """
        Run every extraction step: URLs first, then email addresses.
        """
        self.extract_urls()
        self.extract_emails()