From 59daaeef5988d039a18bcda26e7b40f2f4f7ea7f Mon Sep 17 00:00:00 2001 From: lashoun Date: Tue, 3 Mar 2020 09:28:09 +0100 Subject: [PATCH 1/5] update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5d581a7..7929e76 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ wiktionaryparser/notes.txt notes.txt +tests.ipynb # Byte-compiled / optimized / DLL files __pycache__/ @@ -110,4 +111,4 @@ venv.bak/ # mypy .mypy_cache/ .dmypy.json -dmypy.json \ No newline at end of file +dmypy.json From c911188b3943f862a59d9a8c016c89d769b2b2bb Mon Sep 17 00:00:00 2001 From: lashoun Date: Tue, 3 Mar 2020 09:28:27 +0100 Subject: [PATCH 2/5] added hardcoded french support --- wiktionaryparser.py | 83 ++++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 23 deletions(-) diff --git a/wiktionaryparser.py b/wiktionaryparser.py index bcc68d2..7cfd2ee 100644 --- a/wiktionaryparser.py +++ b/wiktionaryparser.py @@ -1,24 +1,45 @@ -import re, requests +import re +import requests from utils import WordData, Definition, RelatedWord from bs4 import BeautifulSoup from itertools import zip_longest from copy import copy from string import digits -PARTS_OF_SPEECH = [ - "noun", "verb", "adjective", "adverb", "determiner", - "article", "preposition", "conjunction", "proper noun", - "letter", "character", "phrase", "proverb", "idiom", - "symbol", "syllable", "numeral", "initialism", "interjection", - "definitions", "pronoun", "particle", "predicative", "participle", - "suffix", -] +PARTS_OF_SPEECH = { + "english": [ + "noun", "verb", "adjective", "adverb", "determiner", + "article", "preposition", "conjunction", "proper noun", + "letter", "character", "phrase", "proverb", "idiom", + "symbol", "syllable", "numeral", "initialism", "interjection", + "definitions", "pronoun", "particle", "predicative", "participle", + "suffix" + ], + "français": [ + "nom commun", "verbe", "adjectif", "adverbe", "déterminant", + "article", "preposition", "conjonction", "nom propre", + "lettre", "caractère", "expression", "proverbe", "idiome", + "symbole", "syllabe", "nombre", "acronyme", "interjection", + "définitions", "pronom", "particule", "prédicat", "participe", + "suffixe", "locution nominale" + ], +} + +RELATIONS = { + "english": [ + "synonyms", "antonyms", "hypernyms", "hyponyms", + "meronyms", "holonyms", "troponyms", "related terms", + "coordinate terms", + ], + "français": [ + "synonymes", "antonymes", "hypéronymes", "hyponymes", + "méronymes", "holonymes", "paronymes", "troponymes", + "vocabulaire apparenté par le sens", "dérivés", + "anagrammes", "proverbes et phrases toutes faites", + "apparentés étymologiques", "quasi-synonymes" + ] +} -RELATIONS = [ - "synonyms", "antonyms", "hypernyms", "hyponyms", - "meronyms", "holonyms", "troponyms", "related terms", - "coordinate terms", -] def is_subheading(child, parent): child_headings = child.split(".") @@ -30,18 +51,27 @@ def is_subheading(child, parent): return False return True + class WiktionaryParser(object): - def __init__(self): - self.url = "https://en.wiktionary.org/wiki/{}?printable=yes" + def __init__(self, language="français"): self.soup = None self.session = requests.Session() - self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries = 2)) - self.session.mount("https://", requests.adapters.HTTPAdapter(max_retries = 2)) - self.language = 'english' + self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries=2)) + self.session.mount("https://", requests.adapters.HTTPAdapter(max_retries=2)) self.current_word = None - self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH) - self.RELATIONS = copy(RELATIONS) - self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation'] + + if language == "français": + self.language = 'français' + self.url = "https://fr.wiktionary.org/wiki/{}?printable=yes" + self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH["français"]) + self.RELATIONS = copy(RELATIONS["français"]) + self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['étymologie', 'prononciation'] + else: + self.language = 'english' + self.url = "https://en.wiktionary.org/wiki/{}?printable=yes" + self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH["english"]) + self.RELATIONS = copy(RELATIONS["english"]) + self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation'] def include_part_of_speech(self, part_of_speech): part_of_speech = part_of_speech.lower() @@ -86,8 +116,12 @@ def count_digits(self, string): def get_id_list(self, contents, content_type): if content_type == 'etymologies': checklist = ['etymology'] + if self.language == "français": + checklist = ['étymologie'] elif content_type == 'pronunciation': checklist = ['pronunciation'] + if self.language == "français": + checklist = ['prononciation'] elif content_type == 'definitions': checklist = self.PARTS_OF_SPEECH if self.language == 'chinese': @@ -192,7 +226,10 @@ def parse_examples(self, word_contents): table = table.find_next_sibling() examples = [] while table and table.name == 'ol': - for element in table.find_all('dd'): + example_delim = 'dd' + if self.language == "français": + example_delim = 'i' + for element in table.find_all(example_delim): example_text = re.sub(r'\([^)]*\)', '', element.text.strip()) if example_text: examples.append(example_text) From f2035992a05f23ba39ed0c98d397683e60301ae7 Mon Sep 17 00:00:00 2001 From: lashoun Date: Tue, 3 Mar 2020 09:31:39 +0100 Subject: [PATCH 3/5] update readme --- readme.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 8905e96..cdac2f2 100644 --- a/readme.md +++ b/readme.md @@ -2,6 +2,7 @@ A python project which parses word content from Wiktionary in an easy to use JSON format. Right now, it parses etymologies, definitions, pronunciations, examples, audio links and related words. +Only English and French Wiktionaries are supported. [![Downloads](http://pepy.tech/badge/wiktionaryparser)](http://pepy.tech/project/wiktionaryparser) @@ -54,6 +55,10 @@ Right now, it parses etymologies, definitions, pronunciations, examples, audio l >>> parser.set_default_language('french') >>> parser.exclude_part_of_speech('noun') >>> parser.include_relation('alternative forms') +>>> +>>> parser_fr = WiktionaryParser(language="français") +>>> word = parser_fr.fetch('test') +>>> word = parser_fr.fetch('test', 'anglais') ``` #### Requirements @@ -67,4 +72,4 @@ If you want to add features/improvement or report issues, feel free to send a pu #### License -Wiktionary Parser is licensed under [MIT](LICENSE.txt). \ No newline at end of file +Wiktionary Parser is licensed under [MIT](LICENSE.txt). From b2c21e108a0dc11ad761596608d55c111c3c878c Mon Sep 17 00:00:00 2001 From: lashoun Date: Tue, 3 Mar 2020 09:36:17 +0100 Subject: [PATCH 4/5] made english default language --- wiktionaryparser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wiktionaryparser.py b/wiktionaryparser.py index 7cfd2ee..7badcec 100644 --- a/wiktionaryparser.py +++ b/wiktionaryparser.py @@ -53,7 +53,7 @@ def is_subheading(child, parent): class WiktionaryParser(object): - def __init__(self, language="français"): + def __init__(self, language="english"): self.soup = None self.session = requests.Session() self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries=2)) @@ -67,6 +67,8 @@ def __init__(self, language="français"): self.RELATIONS = copy(RELATIONS["français"]) self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['étymologie', 'prononciation'] else: + if language != "english": + print("language unsupported, switched to english") self.language = 'english' self.url = "https://en.wiktionary.org/wiki/{}?printable=yes" self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH["english"]) From 1497d7debbd27783bbe9996529f1a53f2f2de988 Mon Sep 17 00:00:00 2001 From: lashoun Date: Tue, 3 Mar 2020 09:37:27 +0100 Subject: [PATCH 5/5] fixed readme --- readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readme.md b/readme.md index cdac2f2..9c2eb1d 100644 --- a/readme.md +++ b/readme.md @@ -55,7 +55,7 @@ Only English and French Wiktionaries are supported. >>> parser.set_default_language('french') >>> parser.exclude_part_of_speech('noun') >>> parser.include_relation('alternative forms') ->>> + >>> parser_fr = WiktionaryParser(language="français") >>> word = parser_fr.fetch('test') >>> word = parser_fr.fetch('test', 'anglais') @@ -72,4 +72,4 @@ If you want to add features/improvement or report issues, feel free to send a pu #### License -Wiktionary Parser is licensed under [MIT](LICENSE.txt). + Wiktionary Parser is licensed under [MIT](LICENSE.txt).