-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathRuleBasedSentenceSplitter.py
160 lines (129 loc) · 6.62 KB
/
RuleBasedSentenceSplitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from enum import Enum
from typing import List
import warnings
import regex
# Directory that holds 'non_breaking_prefixes_tr.txt'. It is a relative path,
# so it is resolved against the current working directory when the file is opened.
PATH = "DATA"
# Based on the scripts developed by Philipp Koehn and Josh Schroeder: https://pypi.org/project/sentence-splitter/
# Simplified for custom needs and Turkish language
class SentenceSplitterWarning(Warning):
    """Warning category emitted when the splitter receives unusable input."""


class RuleBasedSentenceSplitter:
    """Rule-based sentence splitter driven by a list of non-breaking prefixes.

    The prefix list is read once, at construction time, from
    ``PATH + "/non_breaking_prefixes_tr.txt"``. Every non-empty, non-comment
    line of that file is a token that must not end a sentence even though it
    is followed by a period. Lines tagged ``#NUMERIC_ONLY#`` are non-breaking
    only when the following word starts with a digit.
    """

    class PrefixType(Enum):
        DEFAULT = 1
        NUMERIC_ONLY = 2

    # Replacement used by every break rule: keep both groups, separated by a newline.
    _BREAK_REPLACEMENT = '\\1\n\\2'

    # Break rules applied in order. Group 1 is the sentence end, group 2 the
    # start of the next sentence. Kept as plain strings (not compiled here) so
    # that defining the class does not touch the `regex` module.
    _BREAK_RULES = (
        # Sentence markers such as "?", "!" that are not a period, followed by a sentence starter
        r'([?!]) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\p{Uppercase_Letter}\p{Other_Letter}])',
        # Multiple dots ("...") followed by a sentence starter
        r'(\.[\.]+) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\p{Uppercase_Letter}\p{Other_Letter}])',
        # Sentence ending with punctuation inside quotation marks or parentheses,
        # followed by sentence starter punctuation and an upper-case letter
        (
            r'([?!\.][\ ]*[\'")\]\p{Final_Punctuation}]+) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\ ]*'
            r'[\p{Uppercase_Letter}\p{Other_Letter}])'
        ),
        # Sentence ending with punctuation, followed by sentence starter punctuation and a capital letter
        r'([?!\.]) +([\'"[\u00bf\u00A1\p{Initial_Punctuation}]+[\ ]*[\p{Uppercase_Letter}\p{Other_Letter}])',
    )

    # Word ending in one or more periods: (prefix)(closing punctuation)(periods)
    _TRAILING_PERIOD = r'([\w\.\-]*)([\'\"\)\]\%\p{Final_Punctuation}]*)(\.+)$'
    # Upper-case acronym such as "A.B.C."
    _ACRONYM = r'(\.)[\p{Uppercase_Letter}\p{Other_Letter}\-]+(\.+)$'
    # Next word starts (after optional opening punctuation) with upper case or a digit
    _SENTENCE_START = (
        r'^([ ]*[\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[ ]*[\p{Uppercase_Letter}'
        r'\p{Other_Letter}0-9])'
    )

    def __init__(self):
        """Load the non-breaking prefix list into ``self.non_breaking_prefixes``.

        Raises:
            OSError: if the prefix file cannot be opened.
        """
        self.non_breaking_prefixes = {}
        with open(PATH + "/" + 'non_breaking_prefixes_tr.txt', mode='r', encoding='utf-8') as prefix_file:
            for line in prefix_file:
                # The tag lives inside a comment, so it must be detected
                # before the comment is stripped.
                if '#NUMERIC_ONLY#' in line:
                    prefix_type = RuleBasedSentenceSplitter.PrefixType.NUMERIC_ONLY
                else:
                    prefix_type = RuleBasedSentenceSplitter.PrefixType.DEFAULT
                # The non_breaking_prefixes_tr file contains comments for ease
                # of reading; drop everything from the first '#' onwards.
                prefix = line.partition('#')[0].strip()
                if prefix:
                    self.non_breaking_prefixes[prefix] = prefix_type

    def _has_prefix_type(self, prefix, closing_punct, prefix_type):
        """Return True if *prefix* is a known non-breaking prefix of *prefix_type*
        and no closing punctuation sits between it and the period."""
        return (
            bool(prefix)
            and not closing_punct
            and self.non_breaking_prefixes.get(prefix) == prefix_type
        )

    def _breaks_after(self, word, next_word):
        """Return True if a sentence break belongs between *word* and *next_word*."""
        match = regex.search(pattern=self._TRAILING_PERIOD, string=word, flags=regex.UNICODE)
        if not match:
            return False
        prefix = match.group(1)
        closing_punct = match.group(2)

        # Known honorific / abbreviation: not breaking.
        if self._has_prefix_type(prefix, closing_punct, RuleBasedSentenceSplitter.PrefixType.DEFAULT):
            return False
        # Upper-case acronym: not breaking.
        if regex.search(pattern=self._ACRONYM, string=word, flags=regex.UNICODE):
            return False
        # The next word does not look like a sentence start: not breaking.
        if not regex.search(pattern=self._SENTENCE_START, string=next_word, flags=regex.UNICODE):
            return False

        # A break is always added unless there is a numeric-only non-breaker
        # followed by a word that starts with a number.
        numeric_non_breaker = (
            self._has_prefix_type(prefix, closing_punct, RuleBasedSentenceSplitter.PrefixType.NUMERIC_ONLY)
            and regex.search(pattern='^[0-9]+', string=next_word, flags=regex.UNICODE)
        )
        return not numeric_non_breaker

    # lower level function used by the class to split given string into list of strings, thus sentences
    def split(self, text) -> List[str]:
        """Split *text* into sentences.

        Args:
            text: the string to split; ``None`` and the empty string are accepted.

        Returns:
            The sentences in order. ``None`` (after emitting a
            ``SentenceSplitterWarning``) and empty input both yield ``[]``.
        """
        if text is None:
            warnings.warn("Text is None.", SentenceSplitterWarning)
            return []
        if not text:
            return []

        # Sentence breaker rules that depend on punctuation only.
        for rule in self._BREAK_RULES:
            text = regex.sub(pattern=rule, repl=self._BREAK_REPLACEMENT, string=text, flags=regex.UNICODE)

        # Special punctuation cases: decide word by word, looking one token ahead.
        words = regex.split(pattern=r' +', string=text, flags=regex.UNICODE)
        pieces = [
            word + "\n" if self._breaks_after(word, next_word) else word
            for word, next_word in zip(words, words[1:])
        ]
        # The loop stops one token before the end so the look-ahead is always
        # valid; the last token is appended unchanged.
        pieces.append(words[-1])
        text = " ".join(pieces)

        # Collapse repeated spaces, remove spaces around line breaks and trim
        # white space at the head and tail.
        text = regex.sub(pattern=' +', repl=' ', string=text)
        text = text.replace('\n ', '\n')
        text = text.replace(' \n', '\n')
        text = text.strip()
        return text.split('\n')

    # higher level function that is called by the user to split
    def split_text_into_sentences(self, text) -> List[str]:
        """Split *text* into sentences using this instance's prefix list.

        Delegates to :meth:`split` on ``self`` rather than constructing a new
        splitter, so the prefix file is not re-read on every call.
        """
        return self.split(text=text)