# skell_downloader.py
from __future__ import annotations
import dataclasses
import functools
import json
import urllib
import urllib.parse
import urllib.request
from enum import Enum
from typing import Any, Dict, List, Optional
from urllib.parse import quote
@dataclasses.dataclass
class SkellSentence:
    """A sentence stored as three consecutive text segments.

    The downloader in this module fills the fields from the "Left", "Kwic"
    and "Right" parts of one line of a response.
    """

    # Text of the "Left" part of the line.
    left: str
    # Text of the "Kwic" part of the line.
    kwic: str
    # Text of the "Right" part of the line.
    right: str

    def __str__(self) -> str:
        """Return the three segments glued together in order, with no separator."""
        return "".join((self.left, self.kwic, self.right))
class SkellWordSketchKind(Enum):
    """Word classes that a word sketch can be requested for.

    Each member's value is the one-letter code this module places in request
    URLs, e.g. ``lpos=-n`` when a sketch is requested for ``NOUN``.
    """

    ADVERB = "a"
    CONJUNCTION = "c"
    PRONOUN = "d"
    ADJECTIVE = "j"
    NOUN = "n"
    PREPOSITION = "p"
    VERB = "v"
@dataclasses.dataclass
class SkellCollocation:
    """One collocate listed under a grammatical relation of a word sketch."""

    # Filled from the "Word" entry of the response item.
    word: str
    # Filled from the "Lempos" entry; later sent back as the ``coll`` URL parameter.
    lempos: str
    # Filled from the "Cm" entry; this is the text shown by ``__str__``.
    collocation_pair: str
    # The grammatical relation this collocation was listed under.
    gram_rel: SkellGrammaticalRelation

    def __str__(self) -> str:
        """Return ``ClassName(collocation_pair)``."""
        class_name = type(self).__name__
        return f"{class_name}({self.collocation_pair})"
class SkellGrammaticalRelation:
    """A named group of collocations belonging to one word sketch."""

    def __init__(self, word: str, desc: str, word_sketch: SkellWordSketch):
        """Create an empty relation.

        Args:
            word: The headword the relation belongs to.
            desc: Name of the relation as delivered by the service.
            word_sketch: The word sketch that owns this relation.
        """
        self.word = word
        self.desc = desc
        self.word_sketch = word_sketch
        self.collocations: List[SkellCollocation] = []

    def add_collocation(self, collocation: SkellCollocation) -> None:
        """Append ``collocation`` to the end of ``self.collocations``."""
        self.collocations.append(collocation)

    def __str__(self) -> str:
        """Return the class name followed by one tab-indented line per collocation."""
        rows = "".join(f"\t{item}\n" for item in self.collocations)
        return f"{type(self).__name__}(collocations={rows})"
class SkellWordSketch:
    """The grammatical relations collected for one word."""

    def __init__(self, word: str, kind: Optional[SkellWordSketchKind]):
        """Create an empty word sketch.

        Args:
            word: The headword the sketch describes.
            kind: Word class the sketch was requested for, or ``None`` when the
                request did not name one (``SkellDownloader.get_word_sketch``
                passes ``None`` by default, hence the ``Optional``).
        """
        self.kind = kind
        self.word = word
        self.gram_rels: List[SkellGrammaticalRelation] = []

    def add_gram_rel(self, rel: SkellGrammaticalRelation) -> None:
        """Append ``rel`` to the end of ``self.gram_rels``."""
        self.gram_rels.append(rel)

    def __str__(self) -> str:
        """Return the class name followed by one tab-indented line per relation."""
        # A single join replaces repeated ``+=`` on a growing string.
        rows = "".join(f"\t{rel}\n" for rel in self.gram_rels)
        return f"{self.__class__.__name__}(rels={rows})"
class SkellDownloader:
    """Small client for the JSON endpoints under ``skell.sketchengine.eu``.

    Every public method builds a request URL, fetches it with ``urllib`` and
    converts the decoded JSON into the plain objects defined in this module.
    The language sent with each request is read from ``self.lang`` at call
    time, so it may be changed between calls.
    """

    # Languages supported by SKELL
    langs = ["English", "German", "Italian", "Czech", "Estonian"]

    # Common prefix of every endpoint requested by this class.
    _API_ROOT = "https://skell.sketchengine.eu/api/run.cgi"

    # Maximum number of (lang, word) example lookups remembered per instance.
    _EXAMPLES_CACHE_SIZE = 128

    def __init__(self, lang: str = "English"):
        """Create a downloader that queries the service in ``lang``."""
        self.lang = lang
        # Per-instance memo for get_examples(), keyed on (lang, word) so that a
        # later change of ``self.lang`` can never return another language's
        # results. Values are tuples so the stored data cannot be mutated
        # through a list handed out to a caller.
        self._examples_cache: Dict[tuple, tuple] = {}

    def _api_url(self, endpoint: str, params: Dict[str, str]) -> str:
        """Return the URL for ``endpoint`` with ``params`` as its query string.

        Parameters are emitted in the dict's insertion order and every value is
        percent-encoded with ``urllib.parse.quote``.
        """
        query = "&".join(f"{name}={quote(value)}" for name, value in params.items())
        return f"{self._API_ROOT}/{endpoint}?{query}"

    def _get_json(self, url: str) -> Any:
        """Fetch ``url`` and return the decoded JSON body.

        Errors raised by ``urllib.request.urlopen`` or ``json.loads`` are not
        caught here.
        """
        # NOTE(review): a browser-like User-Agent is sent; presumably the
        # service rejects urllib's default one - confirm.
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req) as res:
            return json.loads(res.read())

    def _join_line_component_list(self, line: Dict, key: str) -> str:
        """Concatenate the "Str" values of the items stored under ``line[key]``.

        A missing ``key`` or a missing "Str" entry contributes an empty string.
        """
        return "".join(d.get("Str", "") for d in line.get(key, []))

    def _get_lines_from_data(self, data: Dict) -> List[SkellSentence]:
        """Build one ``SkellSentence`` per entry of ``data["Lines"]``.

        Returns an empty list when ``data`` has no "Lines" key.
        """
        return [
            SkellSentence(
                self._join_line_component_list(line, "Left"),
                self._join_line_component_list(line, "Kwic"),
                self._join_line_component_list(line, "Right"),
            )
            for line in data.get("Lines", [])
        ]

    def get_examples(self, word: str) -> List[SkellSentence]:
        """Return the sentences of the "concordance" endpoint for ``word``.

        Successful results are remembered per instance and per
        ``(self.lang, word)``; repeated calls do not hit the network again.
        Each call returns a fresh list, so callers may modify it freely.
        """
        cache = self._examples_cache
        key = (self.lang, word)
        if key not in cache:
            data = self._get_json(
                self._api_url(
                    "concordance",
                    {"query": word, "lang": self.lang, "format": "json"},
                )
            )
            if len(cache) >= self._EXAMPLES_CACHE_SIZE:
                # Dicts keep insertion order: drop the oldest entry so the
                # memo stays bounded.
                del cache[next(iter(cache))]
            cache[key] = tuple(self._get_lines_from_data(data))
        return list(cache[key])

    def get_word_sketch(
        self, word: str, kind: Optional[SkellWordSketchKind] = None
    ) -> SkellWordSketch:
        """Return the word sketch of ``word`` from the "wordsketch" endpoint.

        Args:
            word: The headword to look up.
            kind: Optional word class; when given, ``lpos=-<code>`` is added to
                the request.

        Returns:
            A ``SkellWordSketch`` holding one relation per "GramRels" entry of
            the response, each with one collocation per "Words" entry.
        """
        params = {"lang": self.lang, "query": word, "format": "json"}
        if kind is not None:
            params["lpos"] = f"-{kind.value}"
        data = self._get_json(self._api_url("wordsketch", params))

        word_sketch = SkellWordSketch(word, kind)
        for rel_data in data.get("GramRels", []):
            rel = SkellGrammaticalRelation(word, rel_data.get("Name", ""), word_sketch)
            for word_data in rel_data.get("Words", []):
                rel.add_collocation(
                    SkellCollocation(
                        word_data.get("Word", ""),
                        word_data.get("Lempos", ""),
                        word_data.get("Cm", ""),
                        rel,
                    )
                )
            word_sketch.add_gram_rel(rel)
        return word_sketch

    def get_concordances_from_collocation(
        self, collocation: SkellCollocation
    ) -> List[SkellSentence]:
        """Return the sentences of "wordsketch_concordance" for ``collocation``.

        The request's ``headword`` is ``<word>-<kind code>``, so the word sketch
        that produced ``collocation`` must have been requested with a kind.

        Raises:
            ValueError: If the owning word sketch has ``kind`` set to ``None``.
        """
        rel = collocation.gram_rel
        kind = rel.word_sketch.kind
        if kind is None:
            # Without this check the code fails with an opaque AttributeError
            # on ``None.value``.
            raise ValueError(
                "cannot fetch concordances: the collocation's word sketch "
                "was created without a kind"
            )
        # All components are percent-encoded; words with spaces or non-ASCII
        # letters would otherwise yield a URL that urllib refuses to send.
        url = self._api_url(
            "wordsketch_concordance",
            {
                "headword": f"{rel.word}-{kind.value}",
                "lang": self.lang,
                "coll": collocation.lempos,
                "gramrel": rel.desc,
                "format": "json",
            },
        )
        return self._get_lines_from_data(self._get_json(url))

    # TODO: get_concordances(combined_query) is not implemented yet.

    def get_similar_words(self, word: str) -> List[str]:
        """Return the "Word" value of every "Words" entry from the "thesaurus" endpoint.

        An entry without a "Word" key contributes ``None`` to the result.
        """
        data = self._get_json(
            self._api_url(
                "thesaurus",
                {"lang": self.lang, "query": word, "format": "json"},
            )
        )
        return [word_data.get("Word") for word_data in data.get("Words", [])]
if __name__ == "__main__":
    # Manual smoke run: every downloader call below goes to the live service.
    word = "good"
    downloader = SkellDownloader()

    print(f"***** examples of {word} *****")
    for sentence in downloader.get_examples(word):
        print(sentence)

    print(f"***** word sketch of {word} *****")
    word_sketch = downloader.get_word_sketch(word)
    print(word_sketch)

    # NOTE(review): only this header is printed; no concordances are fetched here.
    print(f"***** word sketch concordances of {word} *****")