-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfdump.py
46 lines (46 loc) · 1.01 KB
/
pdfdump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re, subprocess, argparse
#
def dump(path, stop_words=('references',)):
    """Extract the text of a PDF as a list of sentences.

    The PDF is converted by the external ``pdftotext`` tool (7-bit ASCII
    output, no page-break characters).  Output lines are accumulated until
    one of them ends with a period; the accumulated lines are then joined
    with single spaces, every ``'- '`` sequence (as left behind by
    end-of-line hyphenation) is removed, and the text is split wherever
    ``'. '`` is followed by a capital letter.

    Processing stops at the first line that contains any of *stop_words*
    (case-insensitive substring match).  Lines accumulated since the last
    period-terminated line are discarded, both when a stop word is hit and
    when the output ends.

    Args:
        path: Path of the PDF file to read.
        stop_words: Iterable of strings that end processing when one of
            them occurs in a line.

    Returns:
        List of sentence strings in document order.  The exit status of
        ``pdftotext`` is not checked, so a failed conversion that writes
        nothing to stdout yields an empty list.

    Raises:
        OSError: If the ``pdftotext`` executable cannot be started
            (for example because it is not installed).
    """
    # An argument list (no shell) hands the path to pdftotext verbatim, so
    # spaces and shell metacharacters in it are harmless.  run() also waits
    # for the child and closes the pipe, leaving no process behind.
    completed = subprocess.run(
        ['pdftotext', '-enc', 'ASCII7', '-q', '-nopgbrk', str(path), '-'],
        stdout=subprocess.PIPE,
    )
    text = completed.stdout.decode('ascii', errors='replace')

    # Lower-case the stop words once instead of once per line.
    lowered_stops = [word.lower() for word in stop_words]

    pending = []   # lines collected since the last period-terminated line
    results = []
    for line in text.split('\n'):
        lowered = line.lower()
        if any(word in lowered for word in lowered_stops):
            break
        if not line:
            continue
        pending.append(line)
        if line.endswith('.'):
            # A trailing period closes the running passage: undo the line
            # wrapping, then cut it into sentences.
            merged = ' '.join(pending).replace('- ', '')
            results.extend(
                re.sub(r'\. ([A-Z])', r'.\n\1', merged).split('\n')
            )
            pending = []

    return results
#
if __name__ == '__main__':
    # Command-line entry point: takes one positional argument, the PDF path.
    cli = argparse.ArgumentParser()
    cli.add_argument('path', help='PDF path')
    options = cli.parse_args()

    # Print each extracted sentence followed by a '...' separator line.
    # NOTE(review): the separator is assumed to belong inside the loop;
    # confirm against the intended output format.
    for sentence in dump(options.path):
        print(sentence)
        print('...')