suyashb95 · claw89 · Jan 6, 2021 · Jan 7, 2021 · Jan 9, 2021 · Jan 10, 2021
diff --git a/readme.md b/readme.md
@@ -27,7 +27,7 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o
 
 #### Installation
 
-##### Using pip 
+##### Using pip
 * run `pip install wiktionaryparser`
 
 ##### From Source
@@ -55,6 +55,13 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o
 >>> parser.include_relation('alternative forms')
 ```
 
+```python
+>>> word, categories = parser.fetch('test', return_categories=True)
+>>> words = parser.fetch_category('English phrasebook')
+>>> words, subcategories = parser.fetch_category('English phrasebook',
+...                                              return_subcategories=True)
+```
+
 #### Requirements
 
  - requests==2.20.0

diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
@@ -107,7 +107,7 @@ def get_id_list(self, contents, content_type):
                 id_list.append((content_index, content_id, text_to_check))
         return id_list
 
-    def get_word_data(self, language):
+    def get_word_data(self, language, return_categories=False):
         contents = self.soup.find_all('span', {'class': 'toctext'})
         word_contents = []
         start_index = None
@@ -137,7 +137,18 @@ def get_word_data(self, language):
             'pronunciations': self.parse_pronunciations(word_contents),
         }
         json_obj_list = self.map_to_object(word_data)
-        return json_obj_list
+        if not return_categories:
+            return json_obj_list
+        # Skip the first link (presumably the "Categories" label) -- TODO confirm.
+        return json_obj_list, self.parse_categories()[1:]
+
+    def parse_categories(self):
+        """Return the link texts of the page's single ``catlinks`` div, else []."""
+        catlink_boxes = self.soup.find_all('div', {'class': 'catlinks'})
+        if len(catlink_boxes) != 1:
+            return []
+        (box,) = catlink_boxes
+        return [anchor.text for anchor in box.find_all('a')]
 
     def parse_pronunciations(self, word_contents):
         pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation')
@@ -275,10 +286,89 @@ def map_to_object(self, word_data):
             json_obj_list.append(data_obj.to_json())
         return json_obj_list
 
-    def fetch(self, word, language=None, old_id=None):
+    def parse_next_page_links(self, category):
+        """Return the hrefs of the 'next page' links for *category*.
+
+        Only anchors inside the ``mw-pages`` div whose ``title`` equals
+        *category* are considered; an empty list is returned when the
+        current page has no such div.
+        """
+        pages_div = self.soup.find('div', {'id': 'mw-pages'})
+        if pages_div is None:
+            return []
+        link_tags = pages_div.find_all('a', {'title': category})
+        return [link['href'] for link in link_tags if link.text == 'next page']
+
+    def parse_category_words(self):
+        """Return the link texts listed in the ``mw-pages`` section.
+
+        An empty list is returned when the section, or its
+        ``mw-content-ltr`` container, is missing from the current page.
+        """
+        pages_div = self.soup.find('div', {'id': 'mw-pages'})
+        if pages_div is None:
+            return []
+        words_content = pages_div.find('div', {'class': 'mw-content-ltr'})
+        if words_content is None:
+            return []
+        return [word.text for word in words_content.find_all('a')]
+
+    def parse_subcategories(self):
+        """Return the link texts listed in the ``mw-subcategories`` section.
+
+        An empty list is returned when the current page has no such section.
+        """
+        subcategories_div = self.soup.find('div', {'id': 'mw-subcategories'})
+        if subcategories_div is None:
+            return []
+        subcategories = []
+        for group in subcategories_div.find_all('div', {'class': 'mw-category-group'}):
+            subcategories += [cat.text for cat in group.find_all('a')]
+        return subcategories
+
+    def get_category_data(self, category, return_subcategories=False):
+        """Collect the words of *category*, following 'next page' links.
+
+        Expects ``self.soup`` to already hold the first page of the listing.
+        Returns the list of words, or ``(words, subcategories)`` when
+        *return_subcategories* is true; subcategories are read from the
+        last page loaded.
+        """
+        # Imported here so this change needs no edit to the module imports.
+        from urllib.parse import urljoin
+
+        words = self.parse_category_words()
+        next_page_links = self.parse_next_page_links(category)
+        while next_page_links:
+            # urljoin copes with both root-relative and relative hrefs,
+            # avoiding the doubled slash plain concatenation can produce.
+            next_url = urljoin('https://en.wiktionary.org/', next_page_links[0])
+            response = self.session.get(next_url)
+            self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
+            self.clean_html()
+            words += self.parse_category_words()
+            next_page_links = self.parse_next_page_links(category)
+
+        if return_subcategories:
+            return words, self.parse_subcategories()
+        return words
+
+    def fetch(self, word, language=None, old_id=None, return_categories=False):
         language = self.language if not language else language
         response = self.session.get(self.url.format(word), params={'oldid': old_id})
         self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
         self.current_word = word
         self.clean_html()
-        return self.get_word_data(language.lower())
+        return self.get_word_data(language.lower(), return_categories)
+
+    def fetch_category(self, category, return_subcategories=False):
+        """Download the listing page of *category* and return its words.
+
+        ``Category:`` is prepended to *category* before the request. Returns
+        a list of words, or ``(words, subcategories)`` when
+        *return_subcategories* is true.
+        """
+        category = "Category:" + category
+        response = self.session.get(self.url.format(category))
+        self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
+        self.clean_html()
+        return self.get_category_data(category, return_subcategories=return_subcategories)
diff --git a/wiktionaryparser/utils.py b/wiktionaryparser/utils.py
@@ -76,4 +76,4 @@ def to_json(self):
         return {
             'relationshipType': self.relationship_type,
             'words': self.words
-        }
+        }