# webcrawler.py

from bs4 import BeautifulSoup
import requests
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
import os
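
# The NLTK resources used below (the Punkt tokenizers, the English stop-word list,
# and WordNet for lemmatization) must be available locally. If they are missing,
# downloading them once like this should work (left commented out as an assumption
# about your environment):
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')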

# Create directories for this program if they don't exist
def create_directories():
    directories = ["data", "cleaned_data", "knowledge_base"]
    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory)

# Build a searchable knowledge base
def build_knowledge_base(terms, num_files):
    # Initialize a sentence list for each term
    term_dict = {term: [] for term in terms}
    # Iterate over each raw text file (full sentences, before cleaning)
    for i in range(1, num_files):
        filename = f"data/{i}.txt"
        # Read the text file (one sentence per line)
        with open(filename, 'r', encoding='utf-8') as f:
            sentences = f.readlines()
        # Iterate over each sentence in the text file
        for sentence in sentences:
            # Convert the sentence to lowercase
            sentence = sentence.lower()
            # Check if the sentence contains any of the terms
            for term in terms:
                if term in sentence:
                    # If the term is found, store the sentence under that term
                    term_dict[term].append(sentence)
    # Write the sentences containing each term to separate files
    for term, sentences in term_dict.items():
        with open(f"knowledge_base/{term}.txt", 'w', encoding='utf-8') as f:
            for sentence in sentences:
                # Strip the newline already present from readlines() to avoid blank lines
                f.write(sentence.strip() + '\n')
    # Save term_dict as a pickle file
    with open("knowledge_base.pickle", "wb") as f:
        pickle.dump(term_dict, f)
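
# A minimal sketch (not part of the original pipeline) of how the pickled knowledge
# base built above could be queried afterwards; the function name is illustrative.
def search_knowledge_base(term, pickle_path="knowledge_base.pickle"):
    # Load the term -> sentences dictionary saved by build_knowledge_base()
    with open(pickle_path, "rb") as f:
        term_dict = pickle.load(f)
    # Return every stored sentence that was indexed under the (lower-cased) term
    return term_dict.get(term.lower(), [])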

# Extract at least 25 important terms from the cleaned-up files
def extract_important_terms(files_path):
    # Load the cleaned texts
    documents = []
    for file_path in files_path:
        with open(file_path, 'r', encoding='utf-8') as f:
            documents.append(f.read())
    # Initialize TfidfVectorizer
    vectorizer = TfidfVectorizer()
    # Fit and transform the documents
    matrix = vectorizer.fit_transform(documents)
    # Get feature names (terms)
    feature_names = vectorizer.get_feature_names_out()
    # Get the TF-IDF scores for each term in each document
    tfidf_scores = matrix.toarray()
    # Average the TF-IDF scores for each term across documents
    avg_tfidf_scores = tfidf_scores.mean(axis=0)
    # Combine feature names and average TF-IDF scores
    term_scores = list(zip(feature_names, avg_tfidf_scores))
    # Sort the terms by TF-IDF score (from high to low)
    term_scores.sort(key=lambda x: x[1], reverse=True)
    # Return the top 40 important terms
    num_terms = 40
    top_terms = term_scores[:num_terms]
    return top_terms

# Clean up the text files
def clean_text(text):
    # Tokenize the text
    words = word_tokenize(text)
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    # Keep only alphabetic and numeric tokens
    words = [word for word in stripped if word.isalpha() or word.isdigit()]
    # Convert to lowercase
    words = [word.lower() for word in words]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatize each word
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join the words back into a single string
    cleaned_text = ' '.join(words)
    return cleaned_text
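
# For example (roughly, depending on the installed NLTK data), clean_text("Jordan's 6
# NBA titles were iconic.") would return something like "jordan 6 nba title iconic".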

# Read in each raw file and clean it up as much as possible with NLP techniques.
# If you have x files in, you will have x files out.
def clean_text_files(counter):
    for i in range(1, counter):
        # Load the raw text from the file
        filename = f"data/{i}.txt"
        with open(filename, 'r', encoding='utf-8') as f:
            text = f.read()
        cleaned_text = clean_text(text)
        print(f"{i}.txt is cleaned")
        # Write the cleaned text to the cleaned file
        cleaned_filename = f"cleaned_data/{i}_cleaned.txt"
        with open(cleaned_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
        print(f"Wrote cleaned text to {cleaned_filename}")

# Scrape the text off each page and write it to a file (one file per URL).
def save_url_text(url, index):
    sentences = []
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.text, features="html.parser")
        # Find all <p> tags and concatenate their text content
        p_tags = soup.find_all('p')
        text_content = '\n'.join(p.get_text() for p in p_tags)
        # Split into sentences, then strip citation markers like [12] and extra whitespace
        sentences = sent_tokenize(text_content)
        sentences = [re.sub(r'\[\d+\]', '', sentence).strip() for sentence in sentences]
    except requests.RequestException as e:
        # Skip pages that cannot be fetched instead of aborting the whole run
        print(f"Failed to fetch {url}: {e}")
    # Write the sentences that end with a period into the text file
    filename = f"{index}.txt"
    with open(f'data/{filename}', 'w', encoding='utf-8') as f:
        for sentence in sentences:
            if sentence.endswith('.'):
                f.write(sentence + '\n')
    print(f"Wrote data/{filename}")

def main():
    # Step 1: Start with 1-3 URLs that represent a topic (a sport, a celebrity, a place, etc.)
    starter_urls = [
        "https://en.wikipedia.org/wiki/Michael_Jordan",
        "https://www.biography.com/athlete/michael-jordan",
        "https://www.nba.com/history/legends/profiles/michael-jordan"
    ]
    # Create the directories that will be used later
    create_directories()
    # Write the starter URLs to urls.txt first
    with open('urls.txt', 'w') as f:
        for url in starter_urls:
            f.write(str(url) + '\n')
    # Crawl to find 15-25 (or more) relevant URLs
    wiki_counter = 0
    counter = 3  # count the number of URLs; the starter URLs are already counted, so start from 3
    with open('urls.txt', 'a') as f:
        for url in starter_urls:
            res = requests.get(url)
            data = res.text
            soup = BeautifulSoup(data, features="html.parser")
            # Iterate over all <a> tags in the HTML content
            for link in soup.find_all('a'):
                link_str = str(link.get('href'))
                # Skip wiki URLs once more than 15 of them have been collected, and skip URLs ending with '.pdf'
                if ('wiki' in link_str and wiki_counter > 15) or link_str.endswith('.pdf'):
                    continue
                # Remove the '/url?q=' prefix from the URL string
                if link_str.startswith('/url?q='):
                    link_str = link_str[7:]
                # Remove any parameters after the '&' character in the URL string
                if '&' in link_str:
                    i = link_str.find('&')
                    link_str = link_str[:i]
                # Check if the URL string starts with 'http' and is not from Google
                if link_str.startswith('http') and 'google' not in link_str:
                    # Exclude URLs from non-English Wikipedia pages
                    if 'wikipedia.org/wiki' in link_str and 'en.wikipedia.org' not in link_str:
                        continue
                    # Write the URL to the file
                    f.write(link_str + '\n')
                    if 'wiki' in link_str:
                        wiki_counter += 1
                    counter += 1
                    # Exit the loop once 40 relevant URLs have been crawled
                    if counter >= 40:
                        break
            # Exit the loop once 40 relevant URLs have been crawled
            if counter >= 40:
                break
    print("Crawling completed")
    # Step 2: Scrape the text from each URL and save it to its own file
    save_index = 1
    with open('urls.txt', 'r') as f:
        # Read the URLs
        urls = f.readlines()
    for url in urls:
        url = url.strip()
        # Save the text content of the URL
        save_url_text(url, save_index)
        save_index += 1
    # Step 3: Clean up the raw text files
    clean_text_files(counter + 1)
    # Step 4: Extract the most important terms from the cleaned files
    files_path = [f'cleaned_data/{i}_cleaned.txt' for i in range(1, counter + 1)]
    top_terms = extract_important_terms(files_path)
    # Print the top terms and write them to a text file
    with open('top_terms.txt', 'w') as f:
        for term, score in top_terms:
            print(term, score)
            f.write(f"{term} {score}\n")
    # Step 5: Build a searchable knowledge base
    terms = ['jordan', 'michael', 'nba', 'game', 'team', 'player', 'basketball',
             'season', 'bull', 'point', 'first', 'chicago', 'career', 'award', 'championship']
    build_knowledge_base(terms, counter + 1)

if __name__ == "__main__":
    main()
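
# Running this script end to end (e.g. `python webcrawler.py`) is expected, based on the
# code above, to produce: urls.txt with the crawled URLs, raw sentence files in data/,
# cleaned text in cleaned_data/, the top TF-IDF terms in top_terms.txt, per-term sentence
# files in knowledge_base/, and the pickled term dictionary in knowledge_base.pickle.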