-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
60 lines (50 loc) · 1.72 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import csv
import re
from sklearn.cluster import KMeans
import pdfplumber
# Compiled once at import time; matches any single ASCII letter.
_ASCII_LETTER = re.compile(r'[A-Za-z]')


def check(strr):
    """Return True if *strr* contains at least one ASCII letter (A-Z or a-z).

    Args:
        strr: The text to inspect.

    Returns:
        True when any ASCII letter occurs anywhere in the string, False
        otherwise (including for the empty string).
    """
    # search() stops at the first hit instead of collecting every match.
    return _ASCII_LETTER.search(strr) is not None
def dataset(filepath):
    """Load a comma-separated file into a list of names and numeric rows.

    The first field of every row is kept as-is as the row's name; every
    remaining field is converted with ``np.double``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A ``(names, information)`` tuple: ``names`` is a list with the first
        field of each row, ``information`` is a list of lists holding the
        remaining fields of the same row as ``np.double`` values.

    Raises:
        ValueError: If a field after the first one cannot be parsed as a
            floating point number.
    """
    nname = []
    iinformation = []
    # Read-only access is enough; newline='' lets the csv module handle
    # line endings itself.
    with open(filepath, 'r', newline='') as f:
        # Iterate the parsed rows directly rather than through np.array so
        # rows of differing lengths are handled too.
        for row in csv.reader(f, delimiter=","):
            if not row:
                # Blank lines are parsed as empty rows and have no name.
                continue
            nname.append(row[0])
            iinformation.append([np.double(cell) for cell in row[1:]])
    return nname, iinformation
def test_one(file_path):
    """Cluster the numeric words on a PDF's first page and save the groups.

    The function works in three steps:

    1. Extract the words of page 0 of the PDF. Words that contain no ASCII
       letter but do contain a ``'.'`` are kept, and the first six values of
       each kept word record are written as one row to
       ``output_csv/<Title>.csv``.
    2. Read that file back with ``dataset`` and run KMeans with 11 clusters
       on the numeric columns.
    3. Write one row per cluster to ``result_csv/<Title>.csv``, each row
       listing the first-column values of the cluster's members.

    ``<Title>`` is the ``'Title'`` entry of the PDF metadata. The
    ``output_csv`` and ``result_csv`` directories are not created here.

    Args:
        file_path: Path of the PDF file to process.
    """
    # Open the PDF in a context manager so the handle is released as soon as
    # the words and the title have been read.
    with pdfplumber.open(file_path) as pdf:
        words = pdf.pages[0].extract_words(y_tolerance=-1)
        title = pdf.metadata['Title']

    features_path = "output_csv/" + title + ".csv"
    result_path = "result_csv/" + title + ".csv"

    with open(features_path, 'w', newline='') as f:
        writer = csv.writer(f)
        for word in words:
            text = word['text']
            # Keep only letter-free tokens that contain a dot.
            if check(text) or '.' not in text:
                continue
            # First six values of the word record; dataset() later treats
            # column 0 as the name and the rest as numbers.
            # NOTE(review): presumably the text followed by position fields;
            # confirm the key order for the pdfplumber version in use.
            writer.writerow(list(word.values())[0:6])

    name, information = dataset(features_path)

    # NOTE(review): the cluster count is fixed; KMeans is expected to reject
    # inputs with fewer rows than clusters, so confirm pages always yield
    # at least 11 rows.
    n_clusters = 11
    km = KMeans(n_clusters=n_clusters)
    label = km.fit_predict(information)

    # Group the row names by the cluster index assigned to each row.
    clusters = [[] for _ in range(n_clusters)]
    for member, cluster_id in zip(name, label):
        clusters[cluster_id].append(member)

    with open(result_path, 'w', newline='') as f:
        csv.writer(f).writerows(clusters)
if __name__ == '__main__':
    # Running this file as a script currently does nothing: test_one() is
    # not invoked from here.
    pass