Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modified Rake Object #42

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 37 additions & 53 deletions rake_nltk/rake.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Rake(object):

def __init__(
self,
text,
stopwords=None,
punctuations=None,
language="english",
Expand All @@ -44,11 +45,12 @@ def __init__(
:param min_length: Minimum limit on the number of words in a phrase
(Inclusive. Defaults to 1)
"""

# By default use degree to frequency ratio as the metric.
if isinstance(ranking_metric, Metric):
self.metric = ranking_metric
self.__metric = ranking_metric
else:
self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO
self.__metric = Metric.DEGREE_TO_FREQUENCY_RATIO

# If stopwords not provided we use language stopwords by default.
self.stopwords = stopwords
Expand All @@ -61,27 +63,41 @@ def __init__(
self.punctuations = string.punctuation

# All things which act as sentence breaks during keyword extraction.
self.to_ignore = set(chain(self.stopwords, self.punctuations))
self.__to_ignore = set(chain(self.stopwords, self.punctuations))

# Assign min or max length to the attributes
self.min_length = min_length
self.max_length = max_length
self.__min_length = min_length
self.__max_length = max_length

# Stuff to be extracted from the provided text.
self.frequency_dist = None
self.degree = None
self.word_degrees = None
self.rank_list = None
self.ranked_phrases = None

def extract_keywords_from_text(self, text):
"""Method to extract keywords from the text provided.
# Initializing the text and building all the fields
self.set_text(text)

:param text: Text to extract keywords from, provided as a string.
"""
# The getter methods are not needed; read these attributes directly from the Rake object.
# Attributes to read:
# - self.ranked_phrases
# - self.rank_list
# - self.frequency_dist
# - self.word_degrees

def set_text(self, text):
self.text = text
sentences = nltk.tokenize.sent_tokenize(text)
self.extract_keywords_from_sentences(sentences)
self._extract_keywords_from_sentences(sentences)

def set_stopwords(self, stopwords):
self.stopwords = stopwords
self.set_text(self.text)

def extract_keywords_from_sentences(self, sentences):
def set_punctuations(self, punctuations):
self.punctuations = punctuations
self.set_text(self.text)

def _extract_keywords_from_sentences(self, sentences):
"""Method to extract keywords from the list of sentences provided.

:param sentences: Text to extract keywords from, provided as a list
Expand All @@ -92,38 +108,6 @@ def extract_keywords_from_sentences(self, sentences):
self._build_word_co_occurance_graph(phrase_list)
self._build_ranklist(phrase_list)

def get_ranked_phrases(self):
"""Method to fetch ranked keyword strings.

:return: List of strings where each string represents an extracted
keyword string.
"""
return self.ranked_phrases

def get_ranked_phrases_with_scores(self):
"""Method to fetch ranked keyword strings along with their scores.

:return: List of tuples where each tuple is formed of an extracted
keyword string and its score. Ex: (5.68, 'Four Scoures')
"""
return self.rank_list

def get_word_frequency_distribution(self):
"""Method to fetch the word frequency distribution in the given text.

:return: Dictionary (defaultdict) of the format `word -> frequency`.
"""
return self.frequency_dist

def get_word_degrees(self):
"""Method to fetch the degree of words in the given text. Degree can be
defined as the sum of co-occurrences of the word with other words in the
given text.

:return: Dictionary (defaultdict) of the format `word -> degree`.
"""
return self.degree

def _build_frequency_dist(self, phrase_list):
"""Builds frequency distribution of the words in the given body of text.

Expand All @@ -148,9 +132,9 @@ def _build_word_co_occurance_graph(self, phrase_list):
# use in other creative ways if required later.
for (word, coword) in product(phrase, phrase):
co_occurance_graph[word][coword] += 1
self.degree = defaultdict(lambda: 0)
self.word_degrees = defaultdict(lambda: 0)
for key in co_occurance_graph:
self.degree[key] = sum(co_occurance_graph[key].values())
self.word_degrees[key] = sum(co_occurance_graph[key].values())

def _build_ranklist(self, phrase_list):
"""Method to rank each contender phrase using the formula
Expand All @@ -165,10 +149,10 @@ def _build_ranklist(self, phrase_list):
for phrase in phrase_list:
rank = 0.0
for word in phrase:
if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
rank += 1.0 * self.degree[word] / self.frequency_dist[word]
elif self.metric == Metric.WORD_DEGREE:
rank += 1.0 * self.degree[word]
if self.__metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
rank += 1.0 * self.word_degrees[word] / self.frequency_dist[word]
elif self.__metric == Metric.WORD_DEGREE:
rank += 1.0 * self.word_degrees[word]
else:
rank += 1.0 * self.frequency_dist[word]
self.rank_list.append((rank, " ".join(phrase)))
Expand Down Expand Up @@ -213,10 +197,10 @@ def _get_phrase_list_from_words(self, word_list):
:return: List of contender phrases that are formed after dropping
stopwords and punctuations.
"""
groups = groupby(word_list, lambda x: x not in self.to_ignore)
groups = groupby(word_list, lambda x: x not in self.__to_ignore)
phrases = [tuple(group[1]) for group in groups if group[0]]
return list(
filter(
lambda x: self.min_length <= len(x) <= self.max_length, phrases
lambda x: self.__min_length <= len(x) <= self.__max_length, phrases
)
)