From dd8df201b3698f3a01f216cedbb1080725383c20 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 6 Jan 2021 12:52:05 +0800 Subject: [PATCH 01/10] feat: Return category info for word Word entries on wiktionary are associated with various categories. This commit adds the list of associated categories to the returned json structure. --- readme.md | 33 +++++++++++++++++---------------- wiktionaryparser/core.py | 10 +++++++++- wiktionaryparser/utils.py | 2 +- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/readme.md b/readme.md index 794c3f0..d6a4057 100644 --- a/readme.md +++ b/readme.md @@ -7,22 +7,23 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o #### JSON structure ```json -[{ - "pronunciations": { - "text": ["pronunciation text"], - "audio": ["pronunciation audio"] - }, - "definitions": [{ - "relatedWords": [{ - "relationshipType": "word relationship type", - "words": ["list of related words"] - }], - "text": ["list of definitions"], - "partOfSpeech": "part of speech", - "examples": ["list of examples"] - }], - "etymology": "etymology text", -}] +{"content": [{ + "pronunciations": { + "text": ["pronunciation text"], + "audio": ["pronunciation audio"] + }, + "definitions": [{ + "relatedWords": [{ + "relationshipType": "word relationship type", + "words": ["list of related words"] + }], + "text": ["list of definitions"], + "partOfSpeech": "part of speech", + "examples": ["list of examples"] + }], + "etymology": "etymology text", + }] +"categories": ["list of categories"]} ``` #### Installation diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 6497b23..2e80070 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -135,10 +135,18 @@ def get_word_data(self, language): 'etymologies': self.parse_etymologies(word_contents), 'related': self.parse_related_words(word_contents), 'pronunciations': self.parse_pronunciations(word_contents), + 'categories': self.parse_categories(), } 
json_obj_list = self.map_to_object(word_data) return json_obj_list + def parse_categories(self): + categories_list = [] + catlinks = self.soup.find_all('div', {'class': 'catlinks'}) + if len(catlinks) == 1: + categories_list = [cat.text for cat in catlinks[0].find_all('a')] + return categories_list + def parse_pronunciations(self, word_contents): pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation') pronunciation_list = [] @@ -273,7 +281,7 @@ def map_to_object(self, word_data): def_obj.related_words.append(RelatedWord(relation_type, related_words)) data_obj.definition_list.append(def_obj) json_obj_list.append(data_obj.to_json()) - return json_obj_list + return {'content': json_obj_list, 'categories': word_data['categories'][1:]} def fetch(self, word, language=None, old_id=None): language = self.language if not language else language diff --git a/wiktionaryparser/utils.py b/wiktionaryparser/utils.py index 4920620..858322b 100644 --- a/wiktionaryparser/utils.py +++ b/wiktionaryparser/utils.py @@ -76,4 +76,4 @@ def to_json(self): return { 'relationshipType': self.relationship_type, 'words': self.words - } \ No newline at end of file + } From e1297e19f49b4d9fa76c75d3d02583d2f77e6afa Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 7 Jan 2021 12:28:46 +0800 Subject: [PATCH 02/10] feat: Fetch category page The new function fetch_category returns the words included under the provided category. Words are returned in a list. --- wiktionaryparser/core.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 2e80070..33d1fee 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -283,6 +283,15 @@ def map_to_object(self, word_data): json_obj_list.append(data_obj.to_json()) return {'content': json_obj_list, 'categories': word_data['categories'][1:]} + def get_category_data(self): + # TODO: Add functionality for categories with multiple pages on wiktionary. 
+ # TODO: Return subcategories + words = [] + category_group = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category'}) + if len(category_group) == 1: + words = [word.text for word in category_group[0].find_all('a')] + return words + def fetch(self, word, language=None, old_id=None): language = self.language if not language else language response = self.session.get(self.url.format(word), params={'oldid': old_id}) @@ -290,3 +299,10 @@ def fetch(self, word, language=None, old_id=None): self.current_word = word self.clean_html() return self.get_word_data(language.lower()) + + def fetch_category(self, category): + category = "Category:" + category + response = self.session.get(self.url.format(category)) + self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') + self.clean_html() + return self.get_category_data() From 332dd2250d8b029f3014bc4078cc044d2ff755b6 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 9 Jan 2021 20:49:01 +0800 Subject: [PATCH 03/10] fix: changed return structure to pass test A return_categories option is added to the fetch function defaulting to false; with this option set to false, the fetch function will return the original word information in json format. If this option is set to true, the function will return a pair of the word information and a list of its categories. This change was made to make sure the function passes the unit test. 
--- readme.md | 33 ++++++++++++++++----------------- wiktionaryparser/core.py | 16 ++++++++++------ wiktionaryparser/utils.py | 2 +- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/readme.md b/readme.md index d6a4057..794c3f0 100644 --- a/readme.md +++ b/readme.md @@ -7,23 +7,22 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o #### JSON structure ```json -{"content": [{ - "pronunciations": { - "text": ["pronunciation text"], - "audio": ["pronunciation audio"] - }, - "definitions": [{ - "relatedWords": [{ - "relationshipType": "word relationship type", - "words": ["list of related words"] - }], - "text": ["list of definitions"], - "partOfSpeech": "part of speech", - "examples": ["list of examples"] - }], - "etymology": "etymology text", - }] -"categories": ["list of categories"]} +[{ + "pronunciations": { + "text": ["pronunciation text"], + "audio": ["pronunciation audio"] + }, + "definitions": [{ + "relatedWords": [{ + "relationshipType": "word relationship type", + "words": ["list of related words"] + }], + "text": ["list of definitions"], + "partOfSpeech": "part of speech", + "examples": ["list of examples"] + }], + "etymology": "etymology text", +}] ``` #### Installation diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 33d1fee..da1f349 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -107,7 +107,7 @@ def get_id_list(self, contents, content_type): id_list.append((content_index, content_id, text_to_check)) return id_list - def get_word_data(self, language): + def get_word_data(self, language, return_categories): contents = self.soup.find_all('span', {'class': 'toctext'}) word_contents = [] start_index = None @@ -135,10 +135,14 @@ def get_word_data(self, language): 'etymologies': self.parse_etymologies(word_contents), 'related': self.parse_related_words(word_contents), 'pronunciations': self.parse_pronunciations(word_contents), - 'categories': self.parse_categories(), + 
} json_obj_list = self.map_to_object(word_data) - return json_obj_list + if return_categories: + categories = self.parse_categories() + return json_obj_list, categories[1:] + else: + return json_obj_list def parse_categories(self): categories_list = [] @@ -281,7 +285,7 @@ def map_to_object(self, word_data): def_obj.related_words.append(RelatedWord(relation_type, related_words)) data_obj.definition_list.append(def_obj) json_obj_list.append(data_obj.to_json()) - return {'content': json_obj_list, 'categories': word_data['categories'][1:]} + return json_obj_list def get_category_data(self): # TODO: Add functionality for categories with multiple pages on wiktionary. @@ -292,13 +296,13 @@ def get_category_data(self): words = [word.text for word in category_group[0].find_all('a')] return words - def fetch(self, word, language=None, old_id=None): + def fetch(self, word, language=None, old_id=None, return_categories=False): language = self.language if not language else language response = self.session.get(self.url.format(word), params={'oldid': old_id}) self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') self.current_word = word self.clean_html() - return self.get_word_data(language.lower()) + return self.get_word_data(language.lower(), return_categories) def fetch_category(self, category): category = "Category:" + category diff --git a/wiktionaryparser/utils.py b/wiktionaryparser/utils.py index 858322b..f445777 100644 --- a/wiktionaryparser/utils.py +++ b/wiktionaryparser/utils.py @@ -63,7 +63,7 @@ def to_json(self): 'partOfSpeech': self.part_of_speech, 'text': self.text, 'relatedWords': [related_word.to_json() for related_word in self.related_words], - 'examples': self.example_uses + 'examples': self.example_uses } From 4d38a1ded313904acf72e46a5f70d0a30a56f174 Mon Sep 17 00:00:00 2001 From: claw89 Date: Sun, 10 Jan 2021 20:29:53 +0800 Subject: [PATCH 04/10] feat: add option to return subcategories Category pages on wiktionary may have associated 
subcategories. This commit adds the option to return these subcategories as a list along with the category words. The function fetch_category can now return a pair of lists (i.e., words and subcategories) --- wiktionaryparser/core.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index da1f349..069419f 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -135,7 +135,6 @@ def get_word_data(self, language, return_categories): 'etymologies': self.parse_etymologies(word_contents), 'related': self.parse_related_words(word_contents), 'pronunciations': self.parse_pronunciations(word_contents), - } json_obj_list = self.map_to_object(word_data) if return_categories: @@ -287,14 +286,20 @@ def map_to_object(self, word_data): json_obj_list.append(data_obj.to_json()) return json_obj_list - def get_category_data(self): + def get_category_data(self, return_subcategories=False): # TODO: Add functionality for categories with multiple pages on wiktionary. 
- # TODO: Return subcategories words = [] category_group = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category'}) if len(category_group) == 1: words = [word.text for word in category_group[0].find_all('a')] - return words + if return_subcategories: + subcategories = [] + category_groups = self.soup.find('div', {'id': 'mw-subcategories'}).find_all('div', {'class': 'mw-category-group'}) + for category_group in category_groups: + subcategories += [cat.text for cat in category_group.find_all('a')] + return words, subcategories + else: + return words def fetch(self, word, language=None, old_id=None, return_categories=False): language = self.language if not language else language @@ -304,9 +309,9 @@ def fetch(self, word, language=None, old_id=None, return_categories=False): self.clean_html() return self.get_word_data(language.lower(), return_categories) - def fetch_category(self, category): + def fetch_category(self, category, return_subcategories=False): category = "Category:" + category response = self.session.get(self.url.format(category)) self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') self.clean_html() - return self.get_category_data() + return self.get_category_data(return_subcategories) From 2a53b05613514f9642781105a59a8f7e7f34f28e Mon Sep 17 00:00:00 2001 From: claw89 Date: Sun, 10 Jan 2021 20:35:22 +0800 Subject: [PATCH 05/10] style: word parsing consistency Revised code for parsing words on a category page for consistency with the approach for parsing subcategories. --- wiktionaryparser/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 069419f..33b048a 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -289,9 +289,9 @@ def map_to_object(self, word_data): def get_category_data(self, return_subcategories=False): # TODO: Add functionality for categories with multiple pages on wiktionary. 
words = [] - category_group = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category'}) - if len(category_group) == 1: - words = [word.text for word in category_group[0].find_all('a')] + category_groups = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category-group'}) + for category_group in category_groups: + words += [word.text for word in category_group.find_all('a')] if return_subcategories: subcategories = [] category_groups = self.soup.find('div', {'id': 'mw-subcategories'}).find_all('div', {'class': 'mw-category-group'}) From eee2cc578af2158bfe52df8e96bd9b32aba1f1d8 Mon Sep 17 00:00:00 2001 From: claw89 Date: Sun, 10 Jan 2021 21:27:27 +0800 Subject: [PATCH 06/10] bug: categories split over multiple pages Wiktionary limits category pages to 200 words per page. This commit ensures that fetch_category returns all the words by updating self.soup to the next page of words. --- wiktionaryparser/core.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 33b048a..8b6291c 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -286,12 +286,26 @@ def map_to_object(self, word_data): json_obj_list.append(data_obj.to_json()) return json_obj_list + def parse_next_page_links(self): + link_tags = self.soup.find('div', {'id': 'mw-pages'}).find_all('a', {'title': 'Category:English phrasebook'}) + return [link['href'] for link in link_tags if link.text == 'next page'] + + def parse_category_words(self): + words_content = self.soup.find('div', {'id': 'mw-pages'}).find('div', {'class': 'mw-content-ltr'}) + words = [word.text for word in words_content.find_all('a')] + return words + def get_category_data(self, return_subcategories=False): - # TODO: Add functionality for categories with multiple pages on wiktionary. 
words = [] - category_groups = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category-group'}) - for category_group in category_groups: - words += [word.text for word in category_group.find_all('a')] + next_page_links = self.parse_next_page_links() + while len(next_page_links) > 0: + words += self.parse_category_words() + response = self.session.get('https://en.wiktionary.org/' + next_page_links[0]) + self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') + self.clean_html() + next_page_links = self.parse_next_page_links() + words += self.parse_category_words() + if return_subcategories: subcategories = [] category_groups = self.soup.find('div', {'id': 'mw-subcategories'}).find_all('div', {'class': 'mw-category-group'}) From 83988d43f276a6fb4861a20aae6bee7292dbba2f Mon Sep 17 00:00:00 2001 From: claw89 Date: Sun, 10 Jan 2021 21:32:33 +0800 Subject: [PATCH 07/10] docs: Categories example Updated readme to include examples using categories --- readme.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 794c3f0..b43f5ce 100644 --- a/readme.md +++ b/readme.md @@ -27,7 +27,7 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o #### Installation -##### Using pip +##### Using pip * run `pip install wiktionaryparser` ##### From Source @@ -55,6 +55,12 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o >>> parser.include_relation('alternative forms') ``` +```python +>>> word, categories = parser.fetch('test', return_categories=True) +>>> words = parser.fetch_category('English phrasebook') +>>> words, subcategories = = parser.fetch_category('English phrasebook', return_subcategories=True) +``` + #### Requirements - requests==2.20.0 From 0bbd0e1d59964821bf25aa5ee94ab63e672dff1f Mon Sep 17 00:00:00 2001 From: Chris Law Date: Sun, 10 Jan 2021 21:34:34 +0800 Subject: [PATCH 08/10] docs: Fix typos in readme --- 
readme.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/readme.md b/readme.md index b43f5ce..c2c418b 100644 --- a/readme.md +++ b/readme.md @@ -58,7 +58,8 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o ```python >>> word, categories = parser.fetch('test', return_categories=True) >>> words = parser.fetch_category('English phrasebook') ->>> words, subcategories = = parser.fetch_category('English phrasebook', return_subcategories=True) +>>> words, subcategories = parser.fetch_category('English phrasebook', + return_subcategories=True) ``` #### Requirements From 231fd3143e357734e787ce6cff4d2c0efa3adb91 Mon Sep 17 00:00:00 2001 From: claw89 Date: Sun, 10 Jan 2021 21:41:40 +0800 Subject: [PATCH 09/10] format: reversed unintended format changes --- wiktionaryparser/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wiktionaryparser/utils.py b/wiktionaryparser/utils.py index f445777..858322b 100644 --- a/wiktionaryparser/utils.py +++ b/wiktionaryparser/utils.py @@ -63,7 +63,7 @@ def to_json(self): 'partOfSpeech': self.part_of_speech, 'text': self.text, 'relatedWords': [related_word.to_json() for related_word in self.related_words], - 'examples': self.example_uses + 'examples': self.example_uses } From c68a0014b4434acc82b6d7f4bfe5b80a64ff9c50 Mon Sep 17 00:00:00 2001 From: claw89 Date: Mon, 11 Jan 2021 10:49:42 +0800 Subject: [PATCH 10/10] bug: next pg only English phrasebook This commit corrects the parse_next_page_links function which was limited to Category:English_phrasebook. The category name is now passed as an argument, so the function is applicable to all categories. 
--- wiktionaryparser/core.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 8b6291c..10f5bab 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -286,8 +286,8 @@ def map_to_object(self, word_data): json_obj_list.append(data_obj.to_json()) return json_obj_list - def parse_next_page_links(self): - link_tags = self.soup.find('div', {'id': 'mw-pages'}).find_all('a', {'title': 'Category:English phrasebook'}) + def parse_next_page_links(self, category): + link_tags = self.soup.find('div', {'id': 'mw-pages'}).find_all('a', {'title': category}) return [link['href'] for link in link_tags if link.text == 'next page'] def parse_category_words(self): @@ -295,15 +295,15 @@ def parse_category_words(self): words = [word.text for word in words_content.find_all('a')] return words - def get_category_data(self, return_subcategories=False): + def get_category_data(self, category, return_subcategories=False): words = [] - next_page_links = self.parse_next_page_links() + next_page_links = self.parse_next_page_links(category) while len(next_page_links) > 0: words += self.parse_category_words() response = self.session.get('https://en.wiktionary.org/' + next_page_links[0]) self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') self.clean_html() - next_page_links = self.parse_next_page_links() + next_page_links = self.parse_next_page_links(category) words += self.parse_category_words() if return_subcategories: @@ -328,4 +328,4 @@ def fetch_category(self, category, return_subcategories=False): response = self.session.get(self.url.format(category)) self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') self.clean_html() - return self.get_category_data(return_subcategories) + return self.get_category_data(category, return_subcategories=return_subcategories)