From af4294f5fb4ea6b7c3beb310e496c57215c017cb Mon Sep 17 00:00:00 2001 From: Ansah Mohammad Date: Tue, 30 Apr 2024 22:07:54 +0530 Subject: [PATCH] feat: Search engine UI added (#3) * feat: initial commit * chore: cleanup * chore: reorg * feat: final and cleanup * chore: final touchups * feat: added build script * style: style fix * feat: updated readme and moved to src * feat: search engine added * feat: finsihed search Engine * chore: search engine Done --- .gitignore | 2 + Phantom_local/query_engine.py | 49 ---- README.md | 22 +- build.sh | 9 + crawl.sh | 3 +- phantom.py | 26 ++ phantom_crawler/phantom_engine.py | 276 ------------------ .../requirements.txt => requirements.txt | 2 + src/__init__.py | 0 {Phantom_local => src}/logger.py | 5 +- src/phantom.py | 40 +++ {Phantom_local => src}/phantom_engine.py | 100 ++++--- {Phantom_local => src}/phantom_indexing.py | 32 +- src/query_engine.py | 67 +++++ {phantom_crawler => src}/requirements.txt | 1 + templates/home.css | 3 + templates/home.html | 66 +++++ templates/result.html | 83 ++++++ 18 files changed, 402 insertions(+), 384 deletions(-) delete mode 100644 Phantom_local/query_engine.py create mode 100755 build.sh create mode 100644 phantom.py delete mode 100644 phantom_crawler/phantom_engine.py rename Phantom_local/requirements.txt => requirements.txt (52%) create mode 100644 src/__init__.py rename {Phantom_local => src}/logger.py (97%) create mode 100644 src/phantom.py rename {Phantom_local => src}/phantom_engine.py (78%) rename {Phantom_local => src}/phantom_indexing.py (77%) create mode 100644 src/query_engine.py rename {phantom_crawler => src}/requirements.txt (70%) create mode 100644 templates/home.css create mode 100644 templates/home.html create mode 100644 templates/result.html diff --git a/.gitignore b/.gitignore index 01b8d96..96e9bba 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,5 @@ cython_debug/ logs.txt index.json indexed.json +titles.json +.archive diff --git a/Phantom_local/query_engine.py b/Phantom_local/query_engine.py deleted file mode 100644 index 63c5976..0000000 --- a/Phantom_local/query_engine.py +++ /dev/null @@ -1,49 +0,0 @@ -import json -from collections import Counter -from logger import Logger - -class Phantom_Query: - def __init__(self, filename=None): - - self.showlogs = True - - self.data = {} - with open(filename, "r") as f: - self.data = json.load(f) - - self.tf = self.data["tf"] - self.idf = self.data["idf"] - self.tfidf = self.data["tfidf"] - - self.logger = Logger(self.showlogs) - self.log = self.logger.log - - self.lookup = set(self.idf.keys()) - - def query(self, query): - self.log(f"Query recieved : {query}", "Query_Engine") - query = query.split() - query_len = len(query) - query = [term for term in query if term in self.lookup] - query_freq = Counter(query) - query_tfidf = {term: (query_freq[term]/query_len) * self.idf[term] for term in query} - - self.log(f"TF-idf of query : {query_tfidf}", "Query_Engine") - - scores = {} - for doc, tfidf in self.tfidf.items(): - score = sum(tfidf[term] * query_tfidf.get(term, 0.0) for term in tfidf) - scores[doc] = score - - ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True) - self.log(f"Ranked documents : {ranked_docs}", "Query_Engine") - - return ranked_docs - - -phant = Phantom_Query("indexed.json") - -while True: - query = input("Enter the query : ") - print(phant.query(query)) - diff --git a/README.md b/README.md index aa8b089..890a5fa 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,20 @@ -# Phantom -Distributed Crawler 
Indexing Engine +# Phantom Search +Light weight python based search engine + +## Set-up +1) open `crawl.sh` and update the parameters + +```shell +python phantom.py --num_threads 8 --urls "site1.com" "site2.com" +``` +2) now run crawl.sh by typing +```shell +./crawl.sh +``` +This crawls the web and saves indices into `index.json` file + +3) run `build.sh` to Process the indices and run the `Query Engine` + +4) now everytime you can start the query engine by running the file `query_engine.py` + + diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..2213389 --- /dev/null +++ b/build.sh @@ -0,0 +1,9 @@ +source .env/bin/activate + +pip install -r requirements.txt +clear +echo "Installation done" +python3 -m src.phantom_indexing +echo "Phantom Processing done" +clear +python3 -m src.query_engine diff --git a/crawl.sh b/crawl.sh index 25a566f..adafdca 100755 --- a/crawl.sh +++ b/crawl.sh @@ -1,6 +1,5 @@ python3 -m venv .env source .env/bin/activate -cd phantom_crawler pip install -r requirements.txt -python3 phantom_engine.py +python3 -m src.phantom --num_threads 10 --urls "https://www.geeksforgeeks.org/" "https://stackoverflow.com/questions" --show_logs True --print_logs True --sleep 60 diff --git a/phantom.py b/phantom.py new file mode 100644 index 0000000..6ab58c5 --- /dev/null +++ b/phantom.py @@ -0,0 +1,26 @@ +from flask import Flask, render_template, request +from src.query_engine import Phantom_Query +from src.phantom_engine import Parser + +app = Flask(__name__) +engine = Phantom_Query("src/indexed.json", titles="src/titles.json") +parser = Parser() + +@app.route('/', methods=['GET', 'POST']) +def home(): + input_text = "" + if request.method == 'POST': + input_text = request.form.get('input_text') + result = process_input(input_text) + return render_template('result.html', result=result, input_text=input_text) + return render_template('home.html', input_text=input_text) + +def process_input(input_text): + result = engine.query(input_text, count=20) + #(doc, score, title) + print("results ; \n\n") + print(result) + return result + +if __name__ == '__main__': + app.run() \ No newline at end of file diff --git a/phantom_crawler/phantom_engine.py b/phantom_crawler/phantom_engine.py deleted file mode 100644 index 28ad580..0000000 --- a/phantom_crawler/phantom_engine.py +++ /dev/null @@ -1,276 +0,0 @@ -import threading -import time -import random -import requests -from bs4 import BeautifulSoup -from urllib.parse import urlparse, urljoin -import json - -class Phantom: - def __init__(self, url, num_threads=1, show_logs=False, print_logs=False, burnout=700): - print("Phantom Crawler Started") - - self.start_time = time.time() - self.print_logs = print_logs - self.thread_count = num_threads - self.show_logs = show_logs - self.BURNOUT = burnout - - self.url = url - self.threads = [] - self.id_root = {} - self.urls = set() - self.kill = False - self.logger = Logger(self.show_logs) - self.log = self.logger.log - self.storage = Storage() - - self.log("INIT-Phantom", "Phantom") - - def crawler(self, id, url): - burnout = self.BURNOUT - start_time = time.time() - local_urls = set() - traversed = [] - queue = [] - queue.append(url) - parser = Parser(self.show_logs) - epoch = 1 - - def status(): - self.log("Status requested", f"Crawler {id}") - status = f"Crawler {id} \n" - status += f"Root : {url} \n" - status += f"Epoch : {epoch} \n" - # status += f"Traversed : {traversed} \n" - status += f"Queue : {queue}" - - self.log(status, f"Crawler {id}") - - while queue and not self.kill: - if 
time.time() - start_time > burnout: - self.log("Burnout", f"Crawler {id}") - break - - if epoch % 100 == 0: - status() - local_urls = self.update_urls(local_urls, id) - - url = queue.pop(0) - - if url in local_urls: - self.log("Already scanned", f"Crawler {id}") - continue - - local_urls.add(url) - traversed.append(url) - self.log(f"Traversing {url}", f"Crawler {id}") - neighbors, content = parser.parse(url) - self.storage.add(url, content) - queue.extend(neighbors) - # self.log(f"Neighbors {neighbors}", f"Crawler {id}") - epoch += 1 - - queue.clear() - self.log("CRAWLER STOPPED", f"Crawler {id}") - - # def crawler(self, id, url): - # """Crawler using Crawler Object""" - # crawler = Crawler(url, id) - # while not self.kill: - # crawler.crawl() - # # crawler.skip() - - # crawler.kill() - - def update_urls(self, local_url, id): - """update the local_urls with global index""" - self.log("Updating URLs", f"Crawler {id}") - for url in local_url: - self.urls.add(url) - - return self.urls - - def run(self): - while len(self.threads) < self.thread_count: - self.generate(self.url) - - for thread in self.threads: - thread.start() - - def generate(self, url): - id = len(self.threads) + 1 - self.threads.append(threading.Thread(target=self.crawler, args=(id, url))) - self.id_root[id] = url - - def stop(self): - self.kill = True - self.log("STOP-Phantom Issued", "Phantom") - - for threads in self.threads: - threads.join() - - self.log("STOP-Phantom Stopped", "Phantom") - self.end() - - def stats(self): - self.log("Status requested ", "Phantom") - # stats function - print("Number of threads : ", self.thread_count) - print("Threads : ") - for thread in self.threads: - print(thread) - - print("thread : Root : ") - for id, root in self.id_root.items(): - print(f"{id} : {root}") - - print("Time Elapsed : ", time.time() - self.start_time) - print("Burnout : ", self.BURNOUT) - - def end(self): - # cleaning function - self.stats() - - self.storage.save() - self.log("Saved the indices", "Phantom") - - if self.print_logs: - self.logger.save() - - self.threads.clear() - self.id_root.clear() - print("Phantom Crawler Ended") - - -class Parser: - def __init__(self, show_logs): - self.show_logs = show_logs - self.log = Logger(self.show_logs).log - - def clean_url(self, url): - parsed = urlparse(url) - cleaned = parsed.scheme + "://" + parsed.netloc + parsed.path - return cleaned - - def fetch(self, url): - response = requests.get(url) - return response.content - - def parse(self, url): - self.log(f"parsing {url}", "Parser") - - cleaned_url = self.clean_url(url) - content = self.fetch(cleaned_url) - - soup = BeautifulSoup(content, 'html.parser') - - text = soup.get_text() - words = text.split() - links = [urljoin(url, link.get('href')) for link in soup.find_all('a')] - - return links, words - - -class Logger: - def __init__(self, show_logs=False): - self.show_logs = show_logs - self.logs = [] - - def log(self, content, id=None, **kwargs): - log_ = f"{time.strftime('%H:%M:%S')} : " - if id: - log_ += f"{id} : " - - log_ += f"{content} | {kwargs}" - - self.logs.append(log_) - if self.show_logs: - print(log_) - - def save(self): - with open("logs.txt", "w") as f: - for log in self.logs: - f.write(log + "\n") - self.log("Logs saved to logs.txt", "Log") - self.logs.clear() - -class Crawler: - def __init__(self, url, id): - self.id = id - self.url = url - self.running = True - self.kill = False - self.show_logs = True - self.traversed = set() - self.log = Logger(self.show_logs).log - self.parse = Parser().parse - - def 
status(self): - self.log("Status requested", f"Crawler {self.id}") - status = f"Crawler {self.id} \n" - status += "Status : {self.running} \n" - status += f"Root : {self.url} \n" - status += f"Traversed : {self.traversed} \n" - - print(status) - self.log(status, f"Crawler {self.id}") - - def crawl(self): - self.log("Crawling started", f"Crawler {self.id}") - queue = [] - queue.append(self.url) - - while queue and self.running and not self.kill: - url = queue.pop(0) - - if url in self.traversed: - self.log(f"Already traversed {url}", f"Crawler {self.id}") - continue - - - self.log(f"Traverse {self.url}", f"Crawler {self.id}") - self.traversed.add(self.url) - - neighbours = self.parse(self.url) - queue.extend(neighbours) - - self.running = False - self.log("Crawling stopped", f"Crawler {self.id}") - - def kill(self): - self.log("Kill issued", f"Crawler {self.id}") - self.kill = True - self.status() - - self.traversed.clear() - self.log("Crawler killed", f"Crawler {self.id}") - - def skip(self): - pass - - def pause(self): - self.log("Pause issued", f"Crawler {self.id}") - self.running = False - - def resume(self): - self.log("Resume issued", f"Crawler {self.id}") - self.running = True - -class Storage: - def __init__(self, filename="index.json"): - self.filename = filename - self.data = {} - - def add(self, key, value): - self.data[key] = value - - def save(self): - with open(self.filename, 'w') as f: - json.dump(self.data, f) - -phantom = Phantom("https://github.com/AnsahMohammad", 6, show_logs=True, print_logs=True) -phantom.run() -time.sleep(30) -phantom.stop() - diff --git a/Phantom_local/requirements.txt b/requirements.txt similarity index 52% rename from Phantom_local/requirements.txt rename to requirements.txt index 1ffaed5..cddaf8d 100644 --- a/Phantom_local/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ bs4==0.0.2 +Flask==3.0.3 +nltk==3.8.1 requests==2.31.0 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Phantom_local/logger.py b/src/logger.py similarity index 97% rename from Phantom_local/logger.py rename to src/logger.py index 7bce2ac..3c2d60f 100644 --- a/Phantom_local/logger.py +++ b/src/logger.py @@ -1,5 +1,6 @@ import time + class Logger: def __init__(self, show_logs=False): self.show_logs = show_logs @@ -9,13 +10,13 @@ def log(self, content, id=None, **kwargs): log_ = f"{time.strftime('%H:%M:%S')} : " if id: log_ += f"{id} : " - + log_ += f"{content} | {kwargs}" self.logs.append(log_) if self.show_logs: print(log_) - + def save(self, filename="logs.txt"): with open(filename, "w") as f: for log in self.logs: diff --git a/src/phantom.py b/src/phantom.py new file mode 100644 index 0000000..71cf78e --- /dev/null +++ b/src/phantom.py @@ -0,0 +1,40 @@ +import argparse +import time +from .phantom_engine import Phantom + + +def main(num_threads, urls, show_logs, print_logs, sleep): + print("Starting Phantom engine") + print("num_threads: ", num_threads) + print("urls: ", urls) + print("show_logs: ", show_logs) + print("print_logs: ", print_logs) + print("sleep: ", sleep) + phantom = Phantom( + num_threads=num_threads, + urls=urls, + show_logs=show_logs, + print_logs=print_logs, + burnout=sleep, + ) + phantom.run() + if sleep is not None: + time.sleep(sleep) + phantom.stop() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the Phantom engine.") + parser.add_argument("--num_threads", type=int, default=8, help="Number of threads.") + parser.add_argument("--urls", type=str, nargs="+", 
help="List of URLs.") + parser.add_argument( + "--show_logs", type=bool, default=True, help="Whether to show logs." + ) + parser.add_argument( + "--print_logs", type=bool, default=True, help="Whether to print logs." + ) + parser.add_argument("--sleep", type=int, default=300, help="Sleep time in seconds.") + + args = parser.parse_args() + + main(args.num_threads, args.urls, args.show_logs, args.print_logs, args.sleep) diff --git a/Phantom_local/phantom_engine.py b/src/phantom_engine.py similarity index 78% rename from Phantom_local/phantom_engine.py rename to src/phantom_engine.py index aa67946..df4d2c1 100644 --- a/Phantom_local/phantom_engine.py +++ b/src/phantom_engine.py @@ -5,26 +5,32 @@ from bs4 import BeautifulSoup from urllib.parse import urlparse, urljoin import json -from logger import Logger +from .logger import Logger + class Phantom: - def __init__(self, url, num_threads=1, show_logs=False, print_logs=False, burnout=700): + def __init__( + self, urls, num_threads=1, show_logs=False, print_logs=False, burnout=700 + ): print("Phantom Crawler Started") - self.start_time = time.time() self.print_logs = print_logs self.thread_count = num_threads self.show_logs = show_logs self.BURNOUT = burnout - - self.url = url + self.urls = urls + + self.len_urls = len(self.urls) + self.start_time = time.time() + self.url = urls[0] self.threads = [] self.id_root = {} - self.urls = set() + self.visited_urls = set() self.kill = False self.logger = Logger(self.show_logs) self.log = self.logger.log self.storage = Storage() + self.title_storage = Storage("src/titles.json") self.log("INIT-Phantom", "Phantom") @@ -44,7 +50,7 @@ def status(): status += f"Root : {url} \n" status += f"Epoch : {epoch} \n" # status += f"Traversed : {traversed} \n" - status += f"Queue : {queue}" + # status += f"Queue : {queue}" self.log(status, f"Crawler {id}") @@ -58,20 +64,23 @@ def status(): local_urls = self.update_urls(local_urls, id) url = queue.pop(0) + # clean the url + url = parser.clean_url(url) if url in local_urls: self.log("Already scanned", f"Crawler {id}") continue - + local_urls.add(url) traversed.append(url) self.log(f"Traversing {url}", f"Crawler {id}") - neighbors, content = parser.parse(url) + neighbors, content, url, title = parser.parse(url) self.storage.add(url, content) + self.title_storage.add(url, title) queue.extend(neighbors) # self.log(f"Neighbors {neighbors}", f"Crawler {id}") epoch += 1 - + queue.clear() self.log("CRAWLER STOPPED", f"Crawler {id}") @@ -83,20 +92,20 @@ def status(): # # crawler.skip() # crawler.kill() - + def update_urls(self, local_url, id): """update the local_urls with global index""" self.log("Updating URLs", f"Crawler {id}") for url in local_url: - self.urls.add(url) + self.visited_urls.add(url) - return self.urls + return self.visited_urls def run(self): while len(self.threads) < self.thread_count: - self.generate(self.url) + self.generate(self.urls[random.randint(0, self.len_urls - 1)]) - for thread in self.threads: + for thread in self.threads: thread.start() def generate(self, url): @@ -107,10 +116,10 @@ def generate(self, url): def stop(self): self.kill = True self.log("STOP-Phantom Issued", "Phantom") - + for threads in self.threads: threads.join() - + self.log("STOP-Phantom Stopped", "Phantom") self.end() @@ -121,30 +130,33 @@ def stats(self): print("Threads : ") for thread in self.threads: print(thread) - + print("thread : Root : ") for id, root in self.id_root.items(): print(f"{id} : {root}") - + print("Time Elapsed : ", time.time() - self.start_time) - 
print("Burnout : ", self.BURNOUT) + print("Burnout : ", self.BURNOUT) def end(self): # cleaning function self.stats() - + self.storage.save() self.log("Saved the indices", "Phantom") + self.title_storage.save() + self.log("Saved the titles", "Phantom") if self.print_logs: self.logger.save() - + self.threads.clear() self.id_root.clear() print("Phantom Crawler Ended") + class Parser: - def __init__(self, show_logs): + def __init__(self, show_logs=True): self.show_logs = show_logs self.log = Logger(self.show_logs).log @@ -160,16 +172,29 @@ def fetch(self, url): def parse(self, url): self.log(f"parsing {url}", "Parser") - cleaned_url = self.clean_url(url) - content = self.fetch(cleaned_url) + # cleaned_url = self.clean_url(url) since already cleaned disabled + content = self.fetch(url) + + soup = BeautifulSoup(content, "html.parser") - soup = BeautifulSoup(content, 'html.parser') + title = soup.title.string if soup.title else None text = soup.get_text() words = text.split() - links = [urljoin(url, link.get('href')) for link in soup.find_all('a')] + links = [urljoin(url, link.get("href")) for link in soup.find_all("a")] + + return links, words, url, title + + def url_parser(self, url): + self.log(f"parsing {url}", "Parser") + + cleaned_url = self.clean_url(url) + content = self.fetch(cleaned_url) + + soup = BeautifulSoup(content, "html.parser") + title = soup.title.string + return (title, cleaned_url) - return links, words class Crawler: def __init__(self, url, id): @@ -199,21 +224,20 @@ def crawl(self): while queue and self.running and not self.kill: url = queue.pop(0) - + if url in self.traversed: self.log(f"Already traversed {url}", f"Crawler {self.id}") continue - self.log(f"Traverse {self.url}", f"Crawler {self.id}") self.traversed.add(self.url) - + neighbours = self.parse(self.url) queue.extend(neighbours) - + self.running = False self.log("Crawling stopped", f"Crawler {self.id}") - + def kill(self): self.log("Kill issued", f"Crawler {self.id}") self.kill = True @@ -228,13 +252,14 @@ def skip(self): def pause(self): self.log("Pause issued", f"Crawler {self.id}") self.running = False - + def resume(self): self.log("Resume issued", f"Crawler {self.id}") self.running = True + class Storage: - def __init__(self, filename="index.json"): + def __init__(self, filename="src/index.json"): self.filename = filename self.data = {} @@ -242,10 +267,11 @@ def add(self, key, value): self.data[key] = value def save(self): - with open(self.filename, 'w') as f: + with open(self.filename, "w") as f: json.dump(self.data, f) -# phantom = Phantom("https://github.com/AnsahMohammad", 6, show_logs=True, print_logs=True) + +# phantom = Phantom(num_threads=8,urls=["https://github.com/AnsahMohammad"], show_logs=True, print_logs=True) # phantom.run() # time.sleep(30) # phantom.stop() diff --git a/Phantom_local/phantom_indexing.py b/src/phantom_indexing.py similarity index 77% rename from Phantom_local/phantom_indexing.py rename to src/phantom_indexing.py index 583dae1..f1250ae 100644 --- a/Phantom_local/phantom_indexing.py +++ b/src/phantom_indexing.py @@ -1,27 +1,28 @@ from collections import Counter import math import json -import nltk from nltk.corpus import stopwords from nltk.stem import PorterStemmer from nltk.tokenize import word_tokenize import string -from logger import Logger +from .logger import Logger # nltk.download('punkt') # nltk.download('stopwords') + class PhantomIndexer: def __init__(self, filename) -> None: self.data = {} with open(filename, "r") as f: self.data = json.load(f) - + self.documents = 
len(self.data.keys()) self.tf = {} self.idf = {} self.tfidf = {} + self.showlogs = True self.logger = Logger(self.showlogs) self.log = self.logger.log @@ -31,14 +32,14 @@ def calculate_tf(self): for doc, text in self.data.items(): tf_text = Counter(text) for i in tf_text: - tf_text[i] = tf_text[i]/float(len(text)) + tf_text[i] = tf_text[i] / float(len(text)) self.tf[doc] = tf_text def calculate_idf(self): self.log("Calculating IDF", "Phantom-Indexer") idf_text = Counter([word for doc in self.tf.values() for word in doc]) for i in idf_text: - idf_text[i] = math.log10(self.documents/float(idf_text[i])) + idf_text[i] = math.log10(self.documents / float(idf_text[i])) self.idf = idf_text def calculate_tfidf(self): @@ -54,12 +55,12 @@ def calculate_tfidf(self): def process(self): self.log("Processing Data", "Phantom-Indexer") stemmer = PorterStemmer() - stop_words = set(stopwords.words('english')) + stop_words = set(stopwords.words("english")) for doc, words in self.data.items(): processed_words = [] for word in words: - word = word.lower().translate(str.maketrans('', '', string.punctuation)) + word = word.lower().translate(str.maketrans("", "", string.punctuation)) if word not in stop_words and len(word) < 30: stemmed_word = stemmer.stem(word) processed_words.append(stemmed_word) @@ -70,22 +71,21 @@ def process(self): self.calculate_tfidf() for doc in self.tfidf: - self.tfidf[doc] = dict(sorted(self.tfidf[doc].items(), key=lambda x: x[1], reverse=True)) + self.tfidf[doc] = dict( + sorted(self.tfidf[doc].items(), key=lambda x: x[1], reverse=True) + ) return self.tfidf def save(self): - data = { - "tfidf": self.tfidf, - "idf": self.idf, - "tf": self.tf - } - with open("indexed.json", "w") as f: + data = {"tfidf": self.tfidf, "idf": self.idf, "tf": self.tf} + with open("src/indexed.json", "w") as f: json.dump(data, f) - + self.log("Data Saved", "Phantom-Indexer") -processor = PhantomIndexer("index.json") + +processor = PhantomIndexer("src/index.json") processor.process() processor.save() print("Indexing completed!") diff --git a/src/query_engine.py b/src/query_engine.py new file mode 100644 index 0000000..8c09912 --- /dev/null +++ b/src/query_engine.py @@ -0,0 +1,67 @@ +import json +from collections import Counter +from .logger import Logger + + +class Phantom_Query: + def __init__(self, filename="src/indexed.json", titles = None): + + self.showlogs = True + self.title_table = False + + self.data = {} + with open(filename, "r") as f: + self.data = json.load(f) + + if titles: + self.title_table = True + self.titles = {} + with open(titles, "r") as f: + self.titles = json.load(f) + + self.tf = self.data["tf"] + self.idf = self.data["idf"] + self.tfidf = self.data["tfidf"] + + self.logger = Logger(self.showlogs) + self.log = self.logger.log + + self.lookup = set(self.idf.keys()) + self.log("Query Engine Ready", "Query_Engine") + + def query(self, query, count = 10): + self.log(f"Query recieved : {query}", "Query_Engine") + query = query.split() + query_len = len(query) + query = [term for term in query if term in self.lookup] + query_freq = Counter(query) + query_tfidf = { + term: (query_freq[term] / query_len) * self.idf[term] for term in query + } + + self.log(f"TF-idf of query : {query_tfidf}", "Query_Engine") + + scores = {} + for doc, tfidf in self.tfidf.items(): + score = sum(tfidf[term] * query_tfidf.get(term, 0.0) for term in tfidf) + scores[doc] = score + + ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True) + self.log(f"Ranked documents : {ranked_docs[:count]}", 
"Query_Engine") + + final_results = [] + for doc, score in ranked_docs[:count]: + title = self.titles[doc] if self.title_table else None + final_results.append((doc, score, title)) + + return final_results + + def run(self): + while True: + query = input("Enter the query : ") + print(self.query(query)) + + +if __name__ == "__main__": + query_engine = Phantom_Query("src/indexed.json") + query_engine.run() diff --git a/phantom_crawler/requirements.txt b/src/requirements.txt similarity index 70% rename from phantom_crawler/requirements.txt rename to src/requirements.txt index 1ffaed5..8b26f35 100644 --- a/phantom_crawler/requirements.txt +++ b/src/requirements.txt @@ -1,2 +1,3 @@ bs4==0.0.2 +nltk==3.8.1 requests==2.31.0 diff --git a/templates/home.css b/templates/home.css new file mode 100644 index 0000000..6c7d664 --- /dev/null +++ b/templates/home.css @@ -0,0 +1,3 @@ +body { + background-color: rgba(40, 37, 37, 0.439); +} \ No newline at end of file diff --git a/templates/home.html b/templates/home.html new file mode 100644 index 0000000..850c243 --- /dev/null +++ b/templates/home.html @@ -0,0 +1,66 @@ + + + + + + Phantom + + + + + + + +
+

Phantom

+
+ + +
+
+ + diff --git a/templates/result.html b/templates/result.html new file mode 100644 index 0000000..b8bfe47 --- /dev/null +++ b/templates/result.html @@ -0,0 +1,83 @@ + + + + + + Phantom + + + + + + + +
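The template markup above was lost in extraction, but the route it serves is fully defined in phantom.py earlier in this patch. As a rough illustration only — assuming the script is run from the repository root and that `crawl.sh` and `build.sh` have already produced `src/indexed.json` and `src/titles.json`, since phantom.py constructs `Phantom_Query` at import time — the `/` endpoint can be exercised with Flask's built-in test client; the query string below is made up:

```python
# Illustrative sketch, not part of the patch: drive the '/' route from phantom.py
# with Flask's test client. Requires src/indexed.json and src/titles.json to exist,
# because phantom.py builds the query engine when it is imported.
from phantom import app

with app.test_client() as client:
    # GET renders home.html with an empty input_text
    home = client.get("/")
    print(home.status_code)  # expected: 200

    # POST sends the form field the route reads via request.form.get("input_text")
    result = client.post("/", data={"input_text": "python search engine"})
    print(result.status_code)  # expected: 200, body rendered from result.html
```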
diff --git a/templates/result.html b/templates/result.html
new file mode 100644
index 0000000..b8bfe47
--- /dev/null
+++ b/templates/result.html
@@ -0,0 +1,83 @@
[83 added lines of HTML/Jinja markup; the tags did not survive extraction. Recoverable fragments: the page title "Phantom" and a {% for item in result %} loop that renders {{ item[2] }} (the title of each ranked result) before {% endfor %}.]
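For reference, the ranking this patch wires together is plain tf-idf: `PhantomIndexer` stores `tf = count / document length`, `idf = log10(N / document frequency)`, and their per-document product, and `Phantom_Query` scores each document by the sum of its tf-idf weights times the query's weights. A self-contained sketch of that arithmetic, using two made-up toy documents and a made-up query rather than crawler output:

```python
# Sketch of the scoring scheme implemented by PhantomIndexer and Phantom_Query.
import math
from collections import Counter

# Two toy documents standing in for the crawler's index.json contents (made up).
docs = {
    "doc1": ["python", "search", "engine", "python"],
    "doc2": ["web", "crawler", "python"],
}

# tf[d][w] = count of w in d / number of words in d (as in calculate_tf)
tf = {d: {w: c / len(words) for w, c in Counter(words).items()} for d, words in docs.items()}

# df[w] = number of documents containing w; idf[w] = log10(N / df[w]) (as in calculate_idf)
df = Counter(w for terms in tf.values() for w in terms)
idf = {w: math.log10(len(docs) / df[w]) for w in df}

# tfidf[d][w] = tf * idf (as in calculate_tfidf)
tfidf = {d: {w: tf[d][w] * idf[w] for w in tf[d]} for d in docs}

# Query weighting and scoring as in Phantom_Query.query: weight each known query
# term by (frequency / query length) * idf, then take the sum of matching products.
query = ["python", "search"]
known = [t for t in query if t in idf]
q_freq = Counter(known)
q_tfidf = {t: (q_freq[t] / len(query)) * idf[t] for t in known}

scores = {d: sum(w * q_tfidf.get(t, 0.0) for t, w in terms.items()) for d, terms in tfidf.items()}
print(sorted(scores.items(), key=lambda x: x[1], reverse=True))
# doc1 outranks doc2: "search" appears only in doc1, while "python" occurs in every
# document and therefore carries zero idf weight.
```

One behavioral note visible in the patch itself: the indexer lowercases, strips punctuation, and stems document words before these counts, while the query side matches raw query terms against that stemmed vocabulary.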