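"""search_engine.py

A small single-file search engine: it crawls the web breadth-first from a
seed page, builds an inverted index (keyword -> list of urls), scores pages
with a simplified PageRank, orders each keyword's results by rank, and then
answers keyword queries interactively. Written for Python 2 (urllib2,
raw_input, print statements).
"""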
import time
start_time = 0

def display_time(timeTaken):
    # Break the elapsed time (in seconds) into hours, minutes and seconds.
    total = int(timeTaken)
    hr = total / 3600
    mins = (total % 3600) / 60
    sec = total % 60
    print("Total time elapsed :")
    if hr:
        print hr, "hrs",
    if mins:
        print mins, "mins",
    print sec, "secs"

def get_page(url):
    # Fetch the raw HTML of a page; return the sentinel string "error" on failure.
    try:
        import urllib2
        request_headers = {
            "Accept-Language": "en-US,en;q=0.5",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": "http://thewebsite.com",
            "Connection": "keep-alive"
        }
        request = urllib2.Request(url, headers=request_headers)
        return urllib2.urlopen(request).read()
    except Exception:
        return "error"

def compute_ranks(graph):
    d = 0.8     # damping factor
    loops = 10
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    for i in range(0, loops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
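
# Each iteration above applies the simplified PageRank update
#     rank(p) = (1 - d) / N  +  d * sum( rank(q) / outlinks(q) )
# over all pages q that link to p, where N is the number of crawled pages
# and d = 0.8 is the damping factor.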

def get_all_links(text, links, crawled):
    end = 0
    new_links = []
    while(1):
        start = text.find("a href", end)
        if start == -1:
            return new_links    # list of links found on the current page
        start = text.find('"', start)
        end = text.find('"', start + 1)
        link = text[start + 1:end]
        # skip urls that were already crawled, so the same page is not
        # fetched again (and the queue does not grow without bound)
        if link not in crawled:
            links.append(link)
            new_links.append(link)

def add_to_index(index, text, url):
    words = text.split()
    for word in words:
        if word not in index:
            index[word] = []
        if url not in index[word]:
            index[word].append(url)
    return index

def partition(s, ranks, start, end):
    i = start + 1
    j = start + 1
    while(j <= end):
        if(ranks[s[j]] >= ranks[s[start]]):
            s[j], s[i] = s[i], s[j]
            i += 1
        j += 1
    s[start], s[i-1] = s[i-1], s[start]
    return i - 1

def quicksort(s, ranks, start, end):
    if(start < end):
        pivot = partition(s, ranks, start, end)
        quicksort(s, ranks, start, pivot - 1)
        quicksort(s, ranks, pivot + 1, end)

def url_ordering(ranks, index):
    for word in index:
        quicksort(index[word], ranks, 0, len(index[word]) - 1)
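
# partition()/quicksort() implement an in-place quicksort (a Lomuto-style
# partition using the first element as pivot) that moves higher-ranked urls
# toward the front, so url_ordering() leaves each keyword's url list in
# descending PageRank order.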

def crawl_web(page_url):
    global start_time
    links = [page_url]      # queue of urls to crawl; ints in the queue mark level boundaries
    level = 1
    links.append(level)
    crawled = []            # list of urls already crawled
    index = {}              # dict mapping keyword -> list of urls containing it
    graph = {}              # dict mapping url -> list of urls linked from that page
    while(len(links)):
        url = links[0]
        links = links[1:]
        if(type(url) == int):
            # hit a level marker: stop once we are past level 2 and more than
            # 10 minutes have elapsed, otherwise move on to the next level
            if(url > 2 and int(time.time() - start_time) > 10 * 60):
                break
            level += 1
            links.append(level)
            continue
        if url in crawled:      # the same link may have been queued twice
            continue
        crawled.append(url)
        flag = 5
        while(flag):
            text = get_page(url)    # html source code of the page
            if text != "error":     # stop retrying once the page loads successfully
                break               # otherwise try again (at most 5 times)
            flag -= 1
        if not flag:                # page never loaded properly,
            continue                # so ignore it
        add_to_index(index, text, url)
        new_links = get_all_links(text, links, crawled)
        graph[url] = new_links
    ranks = compute_ranks(graph)
    print("Ordering the results..\n")
    url_ordering(ranks, index)      # sorts the urls of each keyword by rank
    return index, ranks
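
# crawl_web() returns the pair (index, ranks): `index` maps each keyword to
# its urls already sorted by rank, and `ranks` maps each crawled url to its
# PageRank score; lookup() below uses both to answer queries.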

def input_seedpage():
    while(1):
        url = raw_input("Enter the complete url of seed page to crawl the web : ")
        if get_page(url) != "error":
            return url
        print("\nCouldn't connect to web, please check the url entered or try again later\n")

def lookup():
    global start_time
    seed_url = input_seedpage()
    start_time = time.time()
    print("\nIt will take some time ...")
    print("\nfetching results ....")
    index_sorted, ranks = crawl_web(seed_url)
    print("Web crawled and results processed successfully !!")
    display_time(time.time() - start_time)
    while(1):
        word = raw_input("\nEnter the word to be searched : (-1 to exit) ")
        if word == '-1':
            exit()
        if word not in index_sorted:
            print("Sorry, no results found.")
            continue
        print("\nSome of the top results are as follows (with their ranks) : \n")
        count = 10      # show at most the top 10 urls for this keyword
        for i in index_sorted[word]:
            print i, " rank= ", ranks[i], "\n"
            count -= 1
            if not count:
                break

if __name__ == "__main__":
    lookup()

# An alternative implementation of compute_ranks(), kept for reference: it
# stores the rank of every url at every iteration in a DP table instead of
# keeping only the latest values.
'''
def compute_ranks(graph):
    d = .8                      # damping factor
    loops = 10
    rank = {}
    dp = {}
    total_url = len(graph)
    for url in graph:
        if url not in dp:
            dp[url] = [0 for x in range(0, loops)]
        dp[url][0] = 1.0 / total_url
    for t in range(1, loops):
        for url in graph:
            dp[url][t] = (1 - d) / total_url    # constant term of the expression
            sum = 0.0                           # summation over incoming links
            for i in graph:
                if url in graph[i] and len(graph[i]):
                    sum += dp[i][t-1] / len(graph[i])
            sum *= d
            dp[url][t] += sum                   # total rank of the page at the t'th loop
    for url in graph:
        rank[url] = dp[url][loops-1]
    return rank
'''