-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathLDA_and_Federalists
38 lines (29 loc) · 1.05 KB
/
LDA_and_Federalists
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import numpy as np
import lda
import nltk
def read_files_into_string(filenames, encoding="utf-8"):
    """Concatenate the contents of several ``federalist_<id>.txt`` files.

    Each entry of *filenames* is interpolated into the path
    ``federalist_{filename}.txt`` (resolved against the current working
    directory), the file is read in full, and the pieces are joined with
    a single newline character, in the order given.

    Args:
        filenames: Iterable of identifiers (anything that can be formatted
            into an f-string) naming the files to read.
        encoding: Text encoding used to decode every file. Passed
            explicitly so the result does not depend on the platform's
            locale default.

    Returns:
        The file contents joined by one newline; an empty string when
        *filenames* is empty.

    Raises:
        OSError: If any of the files cannot be opened.
        UnicodeDecodeError: If a file is not valid in *encoding*.
    """
    strings = []
    for filename in filenames:
        # Without an explicit encoding, open() falls back to the locale's
        # preferred encoding, which differs between machines.
        with open(f'federalist_{filename}.txt', encoding=encoding) as f:
            strings.append(f.read())
    return '\n'.join(strings)
# Build one combined text per author from that author's list of paper files.
# NOTE(review): `papers` is not defined in the visible part of this file; it
# is iterated with .items() as (author, files) pairs, so it is presumably a
# dict of author name -> file identifiers. Confirm it is defined before here.
federalist_by_author = {
    author: read_files_into_string(files) for author, files in papers.items()
}

# Join the texts themselves instead of calling str() on the dict. The dict
# repr escapes every newline into a backslash followed by "n", so the
# newline-based split below could never match, and the author names and
# dict punctuation ended up inside the corpus.
text = '\n'.join(federalist_by_author.values())
# To model a single author instead: text = federalist_by_author["Madison"]

# Each run of five consecutive newlines is treated as a document boundary.
# NOTE(review): assumes the input files actually contain that separator;
# if they do not, the whole corpus stays a single document. Confirm.
document_split = text.split('\n\n\n\n\n')
from sklearn.feature_extraction.text import CountVectorizer

# Convert the documents into a document-term matrix of raw token counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(document_split)

# Token for each column of X. get_feature_names() was removed in
# scikit-learn 1.2, so use its replacement and fall back to the old name
# only on releases that predate get_feature_names_out().
try:
    vocabulary = vectorizer.get_feature_names_out()
except AttributeError:
    vocabulary = vectorizer.get_feature_names()
import lda

# Fit a 10-topic LDA model on the count matrix X (fixed seed, 1000 iterations).
model = lda.LDA(n_topics=10, n_iter=1000, random_state=1)
model.fit(X)
topic_word = model.topic_word_

# Print the n_top_words highest-weighted vocabulary entries of every topic.
n_top_words = 50
vocabulary_array = np.array(vocabulary)
for topic_index, word_weights in enumerate(topic_word):
    # argsort is ascending: reverse it, then keep the first n_top_words.
    top_indices = np.argsort(word_weights)[::-1][:n_top_words]
    joined_words = ' '.join(vocabulary_array[top_indices])
    print(f'topic {topic_index}: {joined_words}')
    # NOTE(review): the original indentation was lost; this blank line is
    # assumed to be emitted after every topic rather than once at the end.
    print()

# Author labels; not referenced by any code visible in this file.
authors = ("Hamilton", "Madison", "Disputed", "Jay", "Shared")