-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtextrank.py
159 lines (124 loc) · 6.58 KB
/
textrank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import sys
import pandas as pd
from collections import defaultdict
#TextRank is an unsupervised algorithm used for keyword extraction in natural language processing. It is based on the concept of PageRank, which is widely used in web search engines. TextRank treats words or phrases as nodes in a graph and establishes edges based on co-occurrence within a context window. The algorithm iteratively updates node weights until convergence, and the final weights indicate the importance of words or phrases. Keywords are then extracted based on the highest weights.
#Nodes: TextRank, unsupervised, algorithm, keyword, extraction, natural language processing, PageRank, concept, web search engines, words, phrases, graph, edges, co-occurrence, context window, iteratively, updates, node weights, convergence, final weights, importance.
#Edges: (TextRank, unsupervised), (TextRank, algorithm), (algorithm, keyword), ...
#TextRank: 0.15, unsupervised: 0.12, algorithm: 0.20, keyword: 0.18, extraction: 0.16, ...
#algorithm, TextRank, keyword, extraction, node weights.
#new_weight(node) = (1 - d) + d * (sum(weight(neighbor) / out_degree(neighbor)) for neighbor in neighbors)
#The update is based on the weights of neighboring nodes and the number of outgoing edges from each node.
class TextrankGraph:
'''textrank graph'''
def __init__(self):
self.graph = defaultdict(list)
self.d = 0.85 # damping coefficient, usually is .85
self.min_diff = 1e-5 # convergence threshold
self.steps = 1000 # iteration steps
self.weight_updates = defaultdict(list)
def addEdge(self, start, end, weight):
"""Add edge between node"""
self.graph[start].append((start, end, weight))
self.graph[end].append((end, start, weight))
def rank_2(self,graph):
weight_default = 1.0 / (len(graph) or 1.0)
nodeweight_dict = defaultdict(float)
outsum_node_dict = defaultdict(float)
for node, out_edge in graph.items():
nodeweight_dict[node] = weight_default
outsum_node_dict[node] = sum((edge[2] for edge in out_edge), 0.0)
sorted_keys = sorted(graph.keys())
step_dict = [0]
weight_updates_df = pd.DataFrame()
for step in range(1, self.steps):
for node in sorted_keys:
s = 0
for e in graph[node]:
s += e[2] / outsum_node_dict[e[1]] * nodeweight_dict[e[1]]
new_weight = (1 - self.d) + self.d * s
nodeweight_dict[node] = new_weight
self.weight_updates[node].append(new_weight)
step_dict.append(sum(nodeweight_dict.values()))
if abs(step_dict[step] - step_dict[step - 1]) <= self.min_diff:
break
weight_updates_df = pd.concat([weight_updates_df, pd.DataFrame(nodeweight_dict, index=[step])])
min_rank, max_rank = 0, 0
for w in nodeweight_dict.values():
if w < min_rank:
min_rank = w
if w > max_rank:
max_rank = w
for n, w in nodeweight_dict.items():
nodeweight_dict[n] = (w - min_rank/10.0) / (max_rank - min_rank/10.0)
# result_weights = {node: nodeweight_dict[node] for node in nodes_list}
return nodeweight_dict
def rank(self):
weight_default = 1.0 / (len(self.graph) or 1.0)
nodeweight_dict = defaultdict(float)
outsum_node_dict = defaultdict(float)
for node, out_edge in self.graph.items():
nodeweight_dict[node] = weight_default
outsum_node_dict[node] = sum((edge[2] for edge in out_edge), 0.0)
sorted_keys = sorted(self.graph.keys())
step_dict = [0]
weight_updates_df = pd.DataFrame() # Initialize an empty DataFrame
for step in range(1, self.steps):
for node in sorted_keys:
s = 0
for e in self.graph[node]:
s += e[2] / outsum_node_dict[e[1]] * nodeweight_dict[e[1]]
new_weight = (1 - self.d) + self.d * s
nodeweight_dict[node] = new_weight
self.weight_updates[node].append(new_weight)
step_dict.append(sum(nodeweight_dict.values()))
if abs(step_dict[step] - step_dict[step - 1]) <= self.min_diff:
break
# Append the current nodeweight_dict to the DataFrame
weight_updates_df = pd.concat([weight_updates_df, pd.DataFrame(nodeweight_dict, index=[step])])
min_rank, max_rank = 0, 0
for w in nodeweight_dict.values():
if w < min_rank:
min_rank = w
if w > max_rank:
max_rank = w
for n, w in nodeweight_dict.items():
nodeweight_dict[n] = (w - min_rank/10.0) / (max_rank - min_rank/10.0)
# print(nodeweight_dict)
# print(weight_updates_df)
# print(nodeweight_dict)
return nodeweight_dict
class TextRank:
"""Extract keywords based on textrank graph algorithm"""
def __init__(self):
self.candi_pos = ['NOUN', 'PROPN', 'VERB']
self.stop_pos = ['NUM', 'ADV']
self.span = 10
def extract_keywords(self, word_list, num_keywords):
# print(word_list,num_keywords)
g = TextrankGraph()
cm = defaultdict(int)
#The pairs are created as long as the part-of-speech
# tag of the subsequent word is in the allowed list (self.candi_pos)
for i, word in enumerate(word_list): # word_list = [['previous', 'ADJ'], ['rumor', 'NOUN']]
if word[1] in self.candi_pos and len(word[0]) > 1: # word = ['previous', 'ADJ']
for j in range(i + 1, i + self.span):
if j >= len(word_list):
break
if word_list[j][1] not in self.candi_pos or word_list[j][1] in self.stop_pos or len(word_list[j][0]) < 2:
continue
# print(word[0],word_list[j][0])
pair = tuple((word[0], word_list[j][0]))
cm[(pair)] += 1
# cm = {('was', 'prison'): 1, ('become', 'prison'): 1}
# print("++++",self.candi_pos)
# print(word_list)
# print(cm)
for terms, w in cm.items():
# print(terms[0],terms[1],w)
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
print(nodes_rank)
nodes_rank = sorted(nodes_rank.items(), key=lambda asd:asd[1], reverse=True)
# print(nodes_rank)
# print(nodes_rank[:num_keywords])
return nodes_rank[:num_keywords]