Skip to content

Commit

Permalink
feat: add supabase integration to query_engine
Browse files Browse the repository at this point in the history
  • Loading branch information
AnsahMohammad committed May 4, 2024
1 parent 6608f9c commit ff3f621
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 15 deletions.
6 changes: 5 additions & 1 deletion build.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# build script for crawling and indexing

python3 -m venv .env
source .env/bin/activate

Expand All @@ -6,9 +8,11 @@ pip install -r requirements.txt

# Check if SUPABASE_URL and SUPABASE_KEY are set
if [ -z "$SUPABASE_URL" ] || [ -z "$SUPABASE_KEY" ]; then
sleep 1
echo "SUPABASE_URL and SUPABASE_KEY are not set. Crawling..."
python3 -m src.phantom --num_threads 10 --urls "https://www.geeksforgeeks.org/" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" "https://bloggingidol.com/best-programming-blogs/" "https://www.hindustantimes.com/india-news/" "https://www.bbc.com/news" --show_logs True --print_logs True --sleep 240
fi

echo "crawling done"
clear
echo "Installation done"
python3 -m nltk.downloader stopwords
Expand Down
4 changes: 3 additions & 1 deletion crawl.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# crawler script to crawl the given urls

python3 -m venv .env
source .env/bin/activate

pip install -r requirements.txt
python3 -m src.phantom --num_threads 10 --urls "https://www.geeksforgeeks.org/" "https://stackoverflow.com/questions" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" --show_logs True --print_logs True --sleep 600
python3 -m src.phantom --num_threads 10 --urls "https://www.geeksforgeeks.org/" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" "https://bloggingidol.com/best-programming-blogs/" "https://www.hindustantimes.com/india-news/" "https://www.bbc.com/news" --show_logs True --print_logs True --sleep 1800
4 changes: 3 additions & 1 deletion local_search.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# index the documents and run the query engine

source .env/bin/activate

pip install -r requirements.txt
clear
echo "Installation done"
python3 -m src.phantom_indexing
echo "Phantom Processing done"
echo "Phantom Indexing done"
clear
python3 -m src.query_engine
2 changes: 1 addition & 1 deletion phantom.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

# setting up the query engine
app = Flask(__name__)
engine = Phantom_Query("indexed.json", titles="titles.json")
engine = Phantom_Query("indexed.json", title_path="titles.json")
parser = Parser()

@app.route("/", methods=["GET"])
Expand Down
2 changes: 2 additions & 0 deletions search.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# run the phantom flask app

python3 -m venv .env
source .env/bin/activate

Expand Down
2 changes: 1 addition & 1 deletion src/phantom.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def main(num_threads, urls, show_logs, print_logs, sleep):
print("urls: ", urls)
print("show_logs: ", show_logs)
print("print_logs: ", print_logs)
print("sleep: ", sleep)
print("sleep after: ", sleep)
phantom = Phantom(
num_threads=num_threads,
urls=urls,
Expand Down
6 changes: 3 additions & 3 deletions src/phantom_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(
self.id_root = {}

self.storage = Storage("index")
self.title_storage = Storage("titles")
self.title_storage = Storage("titles", remote_db=False)
self.visited_urls = self.storage.fetch_visited()

self.kill = False
Expand Down Expand Up @@ -199,12 +199,12 @@ def url_parser(self, url):


class Storage:
def __init__(self, table_name="index", resume=False):
def __init__(self, table_name="index", resume=False, remote_db=True):
self.table_name = table_name
self.data = {}

self.resume = resume
self.remote_db = True
self.remote_db = remote_db

# remote client set-up
self.url = os.environ.get("SUPABASE_URL", None)
Expand Down
60 changes: 53 additions & 7 deletions src/query_engine.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,36 @@
import json
from collections import Counter
from .logger import Logger
import os
from supabase import create_client, Client



class Phantom_Query:
def __init__(self, filename="indexed.json", titles=None):
def __init__(self, filename="indexed.json", title_path=None):

self.showlogs = True
self.title_table = False

self.remote_db = self.check_remote()
self.logger = Logger(self.showlogs)
self.log = self.logger.log

self.data = {}
with open(filename, "r") as f:
self.data = json.load(f)

if titles:
if title_path or self.remote_db:
self.title_path = title_path
self.title_table = True
self.titles = {}
with open(titles, "r") as f:
self.titles = json.load(f)
if not self.load_titles():
self.remote_db = False
self.load_titles()

# self.tf = self.data["tf"]
self.idf = self.data["idf"]
self.tfidf = self.data["tfidf"]

self.logger = Logger(self.showlogs)
self.log = self.logger.log

self.lookup = set(self.idf.keys())
self.log("Query Engine Ready", "Query_Engine")
Expand Down Expand Up @@ -60,6 +66,46 @@ def run(self):
while True:
query = input("Enter the query : ")
print(self.query(query))

def check_remote(self):
    """Set up the Supabase client from environment credentials.

    Reads ``SUPABASE_URL`` and ``SUPABASE_KEY`` from the environment and
    tries to build a client with ``create_client``.

    Side effects: always sets ``self.db_url`` and ``self.db_key``; sets
    ``self.supabase`` only when client creation succeeds.

    Returns:
        bool: True when the remote client is ready, False otherwise
        (missing credentials or a client-creation error).
    """
    remote_db = True

    self.db_url = os.environ.get("SUPABASE_URL", None)
    self.db_key = os.environ.get("SUPABASE_KEY", None)

    if not self.db_url or not self.db_key:
        # Fail fast on missing credentials instead of letting
        # create_client(None, None) raise a confusing exception.
        print("SUPABASE_URL and SUPABASE_KEY are not set")
        remote_db = False
    else:
        try:
            self.supabase = create_client(self.db_url, self.db_key)
            if not self.supabase:
                print("Failed to connect to Supabase")
                remote_db = False
        except Exception as e:
            print(f"Error while creating Supabase client: {e}")
            remote_db = False

    print("Remote database : ", remote_db)
    print("DB Ready")
    return remote_db

def load_titles(self):
    """Populate ``self.titles`` (url -> title).

    Fetches from the remote Supabase ``index`` table when
    ``self.remote_db`` is True; otherwise falls back to the local JSON
    file at ``self.title_path``.

    Returns:
        bool: True on success, False on failure — the caller uses a
        False return to disable the remote DB and retry locally.
    """
    if self.remote_db:
        try:
            self.log("Fetching data from remote DB")
            response = self.supabase.table("index").select("url", "title").execute()
            for record in response.data:
                self.titles[record["url"]] = record["title"]
            self.log(
                f"Data fetched from remote DB: {len(self.titles)}",
                "Phantom-Indexer-Loader",
            )
            return True
        except Exception as e:
            print(f"\nError fetching data from index table: {e}\n")
            return False

    if not self.title_path:
        return False

    self.log("Loading data from local file")
    try:
        with open(self.title_path, "r") as f:
            self.titles = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        # A missing or corrupt titles file should not crash the
        # constructor; report it the same way the remote branch does.
        print(f"\nError loading titles from local file: {e}\n")
        return False
    # BUG FIX: the original local branch fell off the end (returned
    # None), making the caller treat a successful local load as failure.
    return True


if __name__ == "__main__":
Expand Down

0 comments on commit ff3f621

Please sign in to comment.