-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
142 lines (114 loc) · 4.78 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from typing import List, Dict
import re
import json
import pypdf
class Parser:
    """Turn the text of a PDF document into a nested chapter/section mapping.

    Typical usage::

        parser = Parser("document.pdf")
        parser.small_postprocessing()
        info = parser.prepare_dictionary()
        parser.save_info(info)

    Attributes:
        full_text: Concatenated text of every page of the PDF.
        spltd_text: ``full_text`` cut on ``CHAPTER_MARKER``; rewritten by
            ``small_postprocessing``.
        chapters: Chapter titles found by ``get_chapters``.
    """

    # Heading on which the raw document text is cut into chapter pieces.
    CHAPTER_MARKER = "I. Общие положения"

    # Key under which the text preceding the first Roman-numeral heading of a
    # chapter is stored (the marker heading itself is consumed by the split).
    DEFAULT_SUBTOPIC = "Общие положения"

    # Fragments deleted from the text, applied strictly in this order (the
    # four-digit rule must run before the "digits + word" rule).
    _NOISE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r"\([^)]*\)",  # anything in parentheses
            r"\d{1,2}\.\d{1,3}\s-\d{4}\.",  # e.g. "1.02 -2003."
            r"ст\.\s\d+",  # "ст. <number>"
            r"N\s\d+",  # "N <number>"
            r"\b\d{4}\b",  # standalone four-digit numbers (presumably years)
            r"<\d+>",  # markers such as "<1>"
            r"\d{1,2}\s[а-я]{4,}",  # 1-2 digits + lowercase word (presumably dates)
        )
    )

    # A sentence containing any of these substrings is blanked out.
    # NOTE(review): "акт" is a plain substring test, so it also hits words
    # that merely contain these letters - confirm this is intended.
    _DROP_MARKERS = ("ГОСТ", "Абзац", '"О', "акт")

    # Phrases replaced by a single space inside every paragraph.
    _STOP_PHRASES = (
        "Собрание законодательства",
        "Зарегистрирован Министерством юстиции",
        "Устав железнодорожного транспорта",
        "-ФЗ",
        "г. ",
        "Подпункт",
    )

    # Paragraph number such as "7. " or "12. "; all digits are consumed so a
    # multi-digit number leaves nothing behind on the previous paragraph.
    _PARAGRAPH_NUMBER = re.compile(r"\d+\.\s")

    # Roman-numeral heading; the captured group is the heading text up to the
    # next "<digits>." occurrence.
    _SECTION_HEADING = re.compile(r"[IVXLCDM]+\.\s(.*?)(?=\d+\.)")

    def __init__(self, filepath: str):
        """Read the PDF at *filepath* and split its text into chapter pieces."""
        self.full_text = self.get_all_text(filepath)
        self.spltd_text = self.full_text.split(self.CHAPTER_MARKER)
        self.chapters = self.get_chapters()

    def get_all_text(self, filepath: str) -> str:
        """Return the text of all pages of the PDF at *filepath*, concatenated.

        Prints the page count and the total character count as progress info.
        """
        reader = pypdf.PdfReader(filepath)
        pages = reader.pages
        print("Страниц в документе:", len(pages))
        all_text = "".join(page.extract_text() for page in pages)
        print("Всего символов в документе:", len(all_text))
        return all_text

    def get_chapters(self) -> List:
        """Return the chapter titles found at the end of each text piece.

        For every piece of ``self.spltd_text`` the last 12 lines are scanned
        for words written entirely in upper-case Cyrillic letters; those words,
        joined by spaces, form one title. Pieces without such words yield no
        title.
        """
        chapters = []
        for piece in self.spltd_text:
            # NOTE(review): lines are joined without a separator, so the last
            # word of one line and the first word of the next are fused.
            tail = "".join(piece.splitlines()[-12:])
            title = " ".join(re.findall(r"\b[А-ЯЁ]+\b", tail))
            if title:
                chapters.append(title)
        return chapters

    def small_postprocessing(self):
        """Drop the first text piece and collapse whitespace in the others.

        The first piece precedes the first ``CHAPTER_MARKER`` occurrence. The
        call is not idempotent: every call removes one more leading piece.
        """
        self.spltd_text = [" ".join(piece.split()) for piece in self.spltd_text[1:]]

    def clean_text(self, text: str) -> str:
        """Return *text* with noise fragments and flagged sentences removed.

        Steps: delete every match of ``_NOISE_PATTERNS``; replace each
        "."-delimited sentence containing one of ``_DROP_MARKERS`` with a
        single space; replace "Российской Федерации" with a space.
        """
        for pattern in self._NOISE_PATTERNS:
            text = pattern.sub("", text)

        # Rebuild the text sentence by sentence: one linear pass, and every
        # marker is tested against the sentence itself.
        sentences = [
            " " if any(marker in sentence for marker in self._DROP_MARKERS) else sentence
            for sentence in text.split(".")
        ]
        return ".".join(sentences).replace("Российской Федерации", " ")

    def get_paragraph_splitted(self, paragraph: str) -> List:
        """Split *paragraph* into cleaned numbered paragraphs.

        The text is cleaned with ``clean_text`` and cut on paragraph numbers
        ("<digits>. "). In each part the stop phrases are blanked, every
        character other than word characters, whitespace and "." is removed
        and whitespace runs are collapsed. Parts with 10 or fewer characters
        (dots not counted) are discarded.
        """
        cleaned = self.clean_text(paragraph)
        paragraphs = []
        for part in self._PARAGRAPH_NUMBER.split(cleaned):
            for phrase in self._STOP_PHRASES:
                part = part.replace(phrase, " ")
            part = re.sub(r"[^\w\s.]", "", part)
            part = re.sub(r"\s{2,}", " ", part)
            if len(part.replace(".", "")) > 10:
                paragraphs.append(part)
        return paragraphs

    def get_subtopics(self, topic_text: str) -> Dict:
        """Map every section title of a chapter to its list of paragraphs.

        Text before the first Roman-numeral heading is stored under
        ``DEFAULT_SUBTOPIC``. Sections whose title or body is empty are
        skipped; a repeated title overwrites the earlier entry.
        """
        # With one capturing group re.split returns
        # [lead-in, title1, body1, title2, body2, ...]; pairing by position
        # keeps titles and bodies aligned even when some piece is empty.
        parts = [part.strip() for part in self._SECTION_HEADING.split(topic_text)]

        subtopic2text = {}
        if parts[0]:
            subtopic2text[self.DEFAULT_SUBTOPIC] = self.get_paragraph_splitted(parts[0])
        for title, body in zip(parts[1::2], parts[2::2]):
            if title and body:
                subtopic2text[title] = self.get_paragraph_splitted(body)
        return subtopic2text

    def prepare_dictionary(self) -> Dict:
        """Return ``{chapter: {section: [paragraphs]}}`` for the document.

        ``chapters[i]`` is paired with ``spltd_text[i]``. A title is read from
        the end of the piece that precedes the chapter body, so call
        ``small_postprocessing`` first: it drops the leading piece and thereby
        lines both lists up. Surplus items of the longer list are ignored.
        """
        return {
            chapter: self.get_subtopics(text)
            for chapter, text in zip(self.chapters, self.spltd_text)
        }

    def save_info(self, info, path: str = "./generated_data.json"):
        """Write *info* as JSON to *path*.

        The file is written as UTF-8 because ``ensure_ascii=False`` emits
        Cyrillic characters verbatim, which a platform default encoding may
        not be able to represent.
        """
        with open(path, "w", encoding="utf-8") as outfile:
            json.dump(info, outfile, ensure_ascii=False)
def main() -> None:
    """Parse a PDF and write the extracted structure to the default JSON file.

    The PDF path is an optional positional command-line argument; without it
    the script reads "./rzd.pdf".
    """
    # Local import: only the command-line entry point needs argparse.
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="Extract chapters and paragraphs from a PDF into JSON."
    )
    arg_parser.add_argument(
        "filepath",
        nargs="?",
        default="./rzd.pdf",
        help="path to the source PDF (default: ./rzd.pdf)",
    )
    args = arg_parser.parse_args()

    # TODO: make the output location configurable as well.
    pdf_parser = Parser(args.filepath)
    pdf_parser.small_postprocessing()
    info = pdf_parser.prepare_dictionary()
    pdf_parser.save_info(info)


if __name__ == "__main__":
    main()