From dd8df201b3698f3a01f216cedbb1080725383c20 Mon Sep 17 00:00:00 2001
From: unknown <christopher.law89@gmail.com>
Date: Wed, 6 Jan 2021 12:52:05 +0800
Subject: [PATCH 01/10] feat: Return category info for word

Word entries on Wiktionary are associated with various
categories. This commit adds the list of associated
categories to the returned JSON structure.
---
 readme.md                 | 33 +++++++++++++++++----------------
 wiktionaryparser/core.py  | 10 +++++++++-
 wiktionaryparser/utils.py |  2 +-
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/readme.md b/readme.md
index 794c3f0..d6a4057 100644
--- a/readme.md
+++ b/readme.md
@@ -7,22 +7,23 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o
 #### JSON structure
 
 ```json
-[{
-    "pronunciations": {
-        "text": ["pronunciation text"],
-        "audio": ["pronunciation audio"]
-    },
-    "definitions": [{
-        "relatedWords": [{
-            "relationshipType": "word relationship type",
-            "words": ["list of related words"]
-        }],
-        "text": ["list of definitions"],
-        "partOfSpeech": "part of speech",
-        "examples": ["list of examples"]
-    }],
-    "etymology": "etymology text",
-}]
+{"content": [{
+                "pronunciations": {
+                    "text": ["pronunciation text"],
+                    "audio": ["pronunciation audio"]
+                },
+                "definitions": [{
+                    "relatedWords": [{
+                        "relationshipType": "word relationship type",
+                        "words": ["list of related words"]
+                    }],
+                    "text": ["list of definitions"],
+                    "partOfSpeech": "part of speech",
+                    "examples": ["list of examples"]
+                }],
+                "etymology": "etymology text",
+            }]
+"categories": ["list of categories"]}
 ```
 
 #### Installation
diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
index 6497b23..2e80070 100644
--- a/wiktionaryparser/core.py
+++ b/wiktionaryparser/core.py
@@ -135,10 +135,18 @@ def get_word_data(self, language):
             'etymologies': self.parse_etymologies(word_contents),
             'related': self.parse_related_words(word_contents),
             'pronunciations': self.parse_pronunciations(word_contents),
+            'categories': self.parse_categories(),
         }
         json_obj_list = self.map_to_object(word_data)
         return json_obj_list
 
+    def parse_categories(self):
+        categories_list = []
+        catlinks = self.soup.find_all('div', {'class': 'catlinks'})
+        if len(catlinks) == 1:
+            categories_list = [cat.text for cat in catlinks[0].find_all('a')]
+        return categories_list
+
     def parse_pronunciations(self, word_contents):
         pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation')
         pronunciation_list = []
@@ -273,7 +281,7 @@ def map_to_object(self, word_data):
                             def_obj.related_words.append(RelatedWord(relation_type, related_words))
                     data_obj.definition_list.append(def_obj)
             json_obj_list.append(data_obj.to_json())
-        return json_obj_list
+        return {'content': json_obj_list, 'categories': word_data['categories'][1:]}
 
     def fetch(self, word, language=None, old_id=None):
         language = self.language if not language else language
diff --git a/wiktionaryparser/utils.py b/wiktionaryparser/utils.py
index 4920620..858322b 100644
--- a/wiktionaryparser/utils.py
+++ b/wiktionaryparser/utils.py
@@ -76,4 +76,4 @@ def to_json(self):
         return {
             'relationshipType': self.relationship_type,
             'words': self.words
-        }
\ No newline at end of file
+        }

From e1297e19f49b4d9fa76c75d3d02583d2f77e6afa Mon Sep 17 00:00:00 2001
From: unknown <christopher.law89@gmail.com>
Date: Thu, 7 Jan 2021 12:28:46 +0800
Subject: [PATCH 02/10] feat: Fetch category page

The new function fetch_category returns the words
included under the provided category. Words are returned
in a list.
---
 wiktionaryparser/core.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
index 2e80070..33d1fee 100644
--- a/wiktionaryparser/core.py
+++ b/wiktionaryparser/core.py
@@ -283,6 +283,15 @@ def map_to_object(self, word_data):
             json_obj_list.append(data_obj.to_json())
         return {'content': json_obj_list, 'categories': word_data['categories'][1:]}
 
+    def get_category_data(self):
+        # TODO: Add functionality for categories with multiple pages on wiktionary.
+        # TODO: Return subcategories
+        words = []
+        category_group = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category'})
+        if len(category_group) == 1:
+            words = [word.text for word in category_group[0].find_all('a')]
+        return words
+
     def fetch(self, word, language=None, old_id=None):
         language = self.language if not language else language
         response = self.session.get(self.url.format(word), params={'oldid': old_id})
@@ -290,3 +299,10 @@ def fetch(self, word, language=None, old_id=None):
         self.current_word = word
         self.clean_html()
         return self.get_word_data(language.lower())
+
+    def fetch_category(self, category):
+        category = "Category:" + category
+        response = self.session.get(self.url.format(category))
+        self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
+        self.clean_html()
+        return self.get_category_data()

From 332dd2250d8b029f3014bc4078cc044d2ff755b6 Mon Sep 17 00:00:00 2001
From: unknown <christopher.law89@gmail.com>
Date: Sat, 9 Jan 2021 20:49:01 +0800
Subject: [PATCH 03/10] fix: changed return structure to pass test

A return_categories option is added to the fetch
function, defaulting to False. With this option set
to False, the fetch function returns the original
word information in JSON format. If this option is
set to True, the function returns a pair of the word
information and a list of its categories. This change
was made to make sure the function passes the unit test.
---
 readme.md                 | 33 ++++++++++++++++-----------------
 wiktionaryparser/core.py  | 16 ++++++++++------
 wiktionaryparser/utils.py |  2 +-
 3 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/readme.md b/readme.md
index d6a4057..794c3f0 100644
--- a/readme.md
+++ b/readme.md
@@ -7,23 +7,22 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o
 #### JSON structure
 
 ```json
-{"content": [{
-                "pronunciations": {
-                    "text": ["pronunciation text"],
-                    "audio": ["pronunciation audio"]
-                },
-                "definitions": [{
-                    "relatedWords": [{
-                        "relationshipType": "word relationship type",
-                        "words": ["list of related words"]
-                    }],
-                    "text": ["list of definitions"],
-                    "partOfSpeech": "part of speech",
-                    "examples": ["list of examples"]
-                }],
-                "etymology": "etymology text",
-            }]
-"categories": ["list of categories"]}
+[{
+    "pronunciations": {
+        "text": ["pronunciation text"],
+        "audio": ["pronunciation audio"]
+    },
+    "definitions": [{
+        "relatedWords": [{
+            "relationshipType": "word relationship type",
+            "words": ["list of related words"]
+        }],
+        "text": ["list of definitions"],
+        "partOfSpeech": "part of speech",
+        "examples": ["list of examples"]
+    }],
+    "etymology": "etymology text",
+}]
 ```
 
 #### Installation
diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
index 33d1fee..da1f349 100644
--- a/wiktionaryparser/core.py
+++ b/wiktionaryparser/core.py
@@ -107,7 +107,7 @@ def get_id_list(self, contents, content_type):
                 id_list.append((content_index, content_id, text_to_check))
         return id_list
 
-    def get_word_data(self, language):
+    def get_word_data(self, language, return_categories):
         contents = self.soup.find_all('span', {'class': 'toctext'})
         word_contents = []
         start_index = None
@@ -135,10 +135,14 @@ def get_word_data(self, language):
             'etymologies': self.parse_etymologies(word_contents),
             'related': self.parse_related_words(word_contents),
             'pronunciations': self.parse_pronunciations(word_contents),
-            'categories': self.parse_categories(),
+
         }
         json_obj_list = self.map_to_object(word_data)
-        return json_obj_list
+        if return_categories:
+            categories = self.parse_categories()
+            return json_obj_list, categories[1:]
+        else:
+            return json_obj_list
 
     def parse_categories(self):
         categories_list = []
@@ -281,7 +285,7 @@ def map_to_object(self, word_data):
                             def_obj.related_words.append(RelatedWord(relation_type, related_words))
                     data_obj.definition_list.append(def_obj)
             json_obj_list.append(data_obj.to_json())
-        return {'content': json_obj_list, 'categories': word_data['categories'][1:]}
+        return json_obj_list
 
     def get_category_data(self):
         # TODO: Add functionality for categories with multiple pages on wiktionary.
@@ -292,13 +296,13 @@ def get_category_data(self):
             words = [word.text for word in category_group[0].find_all('a')]
         return words
 
-    def fetch(self, word, language=None, old_id=None):
+    def fetch(self, word, language=None, old_id=None, return_categories=False):
         language = self.language if not language else language
         response = self.session.get(self.url.format(word), params={'oldid': old_id})
         self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
         self.current_word = word
         self.clean_html()
-        return self.get_word_data(language.lower())
+        return self.get_word_data(language.lower(), return_categories)
 
     def fetch_category(self, category):
         category = "Category:" + category
diff --git a/wiktionaryparser/utils.py b/wiktionaryparser/utils.py
index 858322b..f445777 100644
--- a/wiktionaryparser/utils.py
+++ b/wiktionaryparser/utils.py
@@ -63,7 +63,7 @@ def to_json(self):
             'partOfSpeech': self.part_of_speech,
             'text': self.text,
             'relatedWords': [related_word.to_json() for related_word in self.related_words],
-            'examples': self.example_uses 
+            'examples': self.example_uses
         }
 
 

From 4d38a1ded313904acf72e46a5f70d0a30a56f174 Mon Sep 17 00:00:00 2001
From: claw89 <christopher.law89@gmail.com>
Date: Sun, 10 Jan 2021 20:29:53 +0800
Subject: [PATCH 04/10] feat: add option to return subcategories

Category pages on Wiktionary may have associated subcategories.
This commit adds the option to return these subcategories as
a list along with the category words. The function fetch_category
can now return a pair of lists (i.e., words and subcategories).
---
 wiktionaryparser/core.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
index da1f349..069419f 100644
--- a/wiktionaryparser/core.py
+++ b/wiktionaryparser/core.py
@@ -135,7 +135,6 @@ def get_word_data(self, language, return_categories):
             'etymologies': self.parse_etymologies(word_contents),
             'related': self.parse_related_words(word_contents),
             'pronunciations': self.parse_pronunciations(word_contents),
-
         }
         json_obj_list = self.map_to_object(word_data)
         if return_categories:
@@ -287,14 +286,20 @@ def map_to_object(self, word_data):
             json_obj_list.append(data_obj.to_json())
         return json_obj_list
 
-    def get_category_data(self):
+    def get_category_data(self, return_subcategories=False):
         # TODO: Add functionality for categories with multiple pages on wiktionary.
-        # TODO: Return subcategories
         words = []
         category_group = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category'})
         if len(category_group) == 1:
             words = [word.text for word in category_group[0].find_all('a')]
-        return words
+        if return_subcategories:
+            subcategories = []
+            category_groups = self.soup.find('div', {'id': 'mw-subcategories'}).find_all('div', {'class': 'mw-category-group'})
+            for category_group in category_groups:
+                subcategories += [cat.text for cat in category_group.find_all('a')]
+            return words, subcategories
+        else:
+            return words
 
     def fetch(self, word, language=None, old_id=None, return_categories=False):
         language = self.language if not language else language
@@ -304,9 +309,9 @@ def fetch(self, word, language=None, old_id=None, return_categories=False):
         self.clean_html()
         return self.get_word_data(language.lower(), return_categories)
 
-    def fetch_category(self, category):
+    def fetch_category(self, category, return_subcategories=False):
         category = "Category:" + category
         response = self.session.get(self.url.format(category))
         self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
         self.clean_html()
-        return self.get_category_data()
+        return self.get_category_data(return_subcategories)

From 2a53b05613514f9642781105a59a8f7e7f34f28e Mon Sep 17 00:00:00 2001
From: claw89 <christopher.law89@gmail.com>
Date: Sun, 10 Jan 2021 20:35:22 +0800
Subject: [PATCH 05/10] style: word parsing consistency

Revised code for parsing words on a category page for consistency
with the approach for parsing subcategories.
---
 wiktionaryparser/core.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
index 069419f..33b048a 100644
--- a/wiktionaryparser/core.py
+++ b/wiktionaryparser/core.py
@@ -289,9 +289,9 @@ def map_to_object(self, word_data):
     def get_category_data(self, return_subcategories=False):
         # TODO: Add functionality for categories with multiple pages on wiktionary.
         words = []
-        category_group = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category'})
-        if len(category_group) == 1:
-            words = [word.text for word in category_group[0].find_all('a')]
+        category_groups = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category-group'})
+        for category_group in category_groups:
+            words += [word.text for word in category_group.find_all('a')]
         if return_subcategories:
             subcategories = []
             category_groups = self.soup.find('div', {'id': 'mw-subcategories'}).find_all('div', {'class': 'mw-category-group'})

From eee2cc578af2158bfe52df8e96bd9b32aba1f1d8 Mon Sep 17 00:00:00 2001
From: claw89 <christopher.law89@gmail.com>
Date: Sun, 10 Jan 2021 21:27:27 +0800
Subject: [PATCH 06/10] bug: categories split over multiple pages

Wiktionary limits category pages to 200 words per page.
This commit ensures that fetch_category returns all the
words by updating self.soup to the next page of words.
---
 wiktionaryparser/core.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
index 33b048a..8b6291c 100644
--- a/wiktionaryparser/core.py
+++ b/wiktionaryparser/core.py
@@ -286,12 +286,26 @@ def map_to_object(self, word_data):
             json_obj_list.append(data_obj.to_json())
         return json_obj_list
 
+    def parse_next_page_links(self):
+        link_tags = self.soup.find('div', {'id': 'mw-pages'}).find_all('a', {'title': 'Category:English phrasebook'})
+        return [link['href'] for link in link_tags if link.text == 'next page']
+
+    def parse_category_words(self):
+        words_content = self.soup.find('div', {'id': 'mw-pages'}).find('div', {'class': 'mw-content-ltr'})
+        words = [word.text for word in words_content.find_all('a')]
+        return words
+
     def get_category_data(self, return_subcategories=False):
-        # TODO: Add functionality for categories with multiple pages on wiktionary.
         words = []
-        category_groups = self.soup.find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category-group'})
-        for category_group in category_groups:
-            words += [word.text for word in category_group.find_all('a')]
+        next_page_links = self.parse_next_page_links()
+        while len(next_page_links) > 0:
+            words += self.parse_category_words()
+            response = self.session.get('https://en.wiktionary.org/' + next_page_links[0])
+            self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
+            self.clean_html()
+            next_page_links = self.parse_next_page_links()
+        words += self.parse_category_words()
+
         if return_subcategories:
             subcategories = []
             category_groups = self.soup.find('div', {'id': 'mw-subcategories'}).find_all('div', {'class': 'mw-category-group'})

From 83988d43f276a6fb4861a20aae6bee7292dbba2f Mon Sep 17 00:00:00 2001
From: claw89 <christopher.law89@gmail.com>
Date: Sun, 10 Jan 2021 21:32:33 +0800
Subject: [PATCH 07/10] docs: Categories example

Updated readme to include examples using categories
---
 readme.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index 794c3f0..b43f5ce 100644
--- a/readme.md
+++ b/readme.md
@@ -27,7 +27,7 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o
 
 #### Installation
 
-##### Using pip 
+##### Using pip
 * run `pip install wiktionaryparser`
 
 ##### From Source
@@ -55,6 +55,12 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o
 >>> parser.include_relation('alternative forms')
 ```
 
+```python
+>>> word, categories = parser.fetch('test', return_categories=True)
+>>> words = parser.fetch_category('English phrasebook')
+>>> words, subcategories = = parser.fetch_category('English phrasebook', return_subcategories=True)
+```
+
 #### Requirements
 
  - requests==2.20.0

From 0bbd0e1d59964821bf25aa5ee94ab63e672dff1f Mon Sep 17 00:00:00 2001
From: Chris Law <christopher.law89@gmail.com>
Date: Sun, 10 Jan 2021 21:34:34 +0800
Subject: [PATCH 08/10] docs: Fix typos in readme

---
 readme.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index b43f5ce..c2c418b 100644
--- a/readme.md
+++ b/readme.md
@@ -58,7 +58,8 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o
 ```python
 >>> word, categories = parser.fetch('test', return_categories=True)
 >>> words = parser.fetch_category('English phrasebook')
->>> words, subcategories = = parser.fetch_category('English phrasebook', return_subcategories=True)
+>>> words, subcategories = parser.fetch_category('English phrasebook', 
+                                                  return_subcategories=True)
 ```
 
 #### Requirements

From 231fd3143e357734e787ce6cff4d2c0efa3adb91 Mon Sep 17 00:00:00 2001
From: claw89 <christopher.law89@gmail.com>
Date: Sun, 10 Jan 2021 21:41:40 +0800
Subject: [PATCH 09/10] format: reversed unintended format changes

---
 wiktionaryparser/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wiktionaryparser/utils.py b/wiktionaryparser/utils.py
index f445777..858322b 100644
--- a/wiktionaryparser/utils.py
+++ b/wiktionaryparser/utils.py
@@ -63,7 +63,7 @@ def to_json(self):
             'partOfSpeech': self.part_of_speech,
             'text': self.text,
             'relatedWords': [related_word.to_json() for related_word in self.related_words],
-            'examples': self.example_uses
+            'examples': self.example_uses 
         }
 
 

From c68a0014b4434acc82b6d7f4bfe5b80a64ff9c50 Mon Sep 17 00:00:00 2001
From: claw89 <christopher.law89@gmail.com>
Date: Mon, 11 Jan 2021 10:49:42 +0800
Subject: [PATCH 10/10] bug: next pg only English phrasebook

This commit corrects the parse_next_page_links
function, which was limited to Category:English_phrasebook.
The category name is now passed as an argument, so the
function is applicable to all categories.
---
 wiktionaryparser/core.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
index 8b6291c..10f5bab 100644
--- a/wiktionaryparser/core.py
+++ b/wiktionaryparser/core.py
@@ -286,8 +286,8 @@ def map_to_object(self, word_data):
             json_obj_list.append(data_obj.to_json())
         return json_obj_list
 
-    def parse_next_page_links(self):
-        link_tags = self.soup.find('div', {'id': 'mw-pages'}).find_all('a', {'title': 'Category:English phrasebook'})
+    def parse_next_page_links(self, category):
+        link_tags = self.soup.find('div', {'id': 'mw-pages'}).find_all('a', {'title': category})
         return [link['href'] for link in link_tags if link.text == 'next page']
 
     def parse_category_words(self):
@@ -295,15 +295,15 @@ def parse_category_words(self):
         words = [word.text for word in words_content.find_all('a')]
         return words
 
-    def get_category_data(self, return_subcategories=False):
+    def get_category_data(self, category, return_subcategories=False):
         words = []
-        next_page_links = self.parse_next_page_links()
+        next_page_links = self.parse_next_page_links(category)
         while len(next_page_links) > 0:
             words += self.parse_category_words()
             response = self.session.get('https://en.wiktionary.org/' + next_page_links[0])
             self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
             self.clean_html()
-            next_page_links = self.parse_next_page_links()
+            next_page_links = self.parse_next_page_links(category)
         words += self.parse_category_words()
 
         if return_subcategories:
@@ -328,4 +328,4 @@ def fetch_category(self, category, return_subcategories=False):
         response = self.session.get(self.url.format(category))
         self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
         self.clean_html()
-        return self.get_category_data(return_subcategories)
+        return self.get_category_data(category, return_subcategories=return_subcategories)