Skip to content

Commit

Permalink
Merge branch 'main' into loader
Browse files Browse the repository at this point in the history
  • Loading branch information
AnsahMohammad committed May 7, 2024
2 parents 5a234ee + f7d6a3c commit 2b6407d
Show file tree
Hide file tree
Showing 11 changed files with 88 additions and 76 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Byte-compiled / optimized / DLL files
__pycache__/
__pycache__
*.py[cod]
*$py.class

Expand Down
4 changes: 3 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ pip install -r requirements.txt
if [ -z "$SUPABASE_URL" ] || [ -z "$SUPABASE_KEY" ]; then
sleep 1
echo "SUPABASE_URL and SUPABASE_KEY are not set. Crawling..."
python3 -m src.phantom --num_threads 10 --urls "https://www.geeksforgeeks.org/" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" "https://bloggingidol.com/best-programming-blogs/" "https://www.hindustantimes.com/india-news/" "https://www.bbc.com/news" --show_logs True --print_logs True --sleep 240
python3 -m src.phantom --num_threads 5 --urls "https://www.geeksforgeeks.org/" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" "https://bloggingidol.com/best-programming-blogs/" "https://www.hindustantimes.com/india-news/" "https://www.bbc.com/news" --show_logs True --print_logs True --sleep 240
else
echo "SUPABASE_URL and SUPABASE_KEY are set. Not crawling."
fi
echo "crawling done"
clear
Expand Down
13 changes: 9 additions & 4 deletions phantom.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,37 @@
engine = Phantom_Query("indexed.json", title_path="titles.json")
parser = Parser()


# Route: GET / — serve the search page, or results when a query is present.
@app.route("/", methods=["GET"])
def home():
    """Render the home page, or the result page when ``?q=`` is supplied."""
    # NOTE(review): the next two lines are old/new diff residue of the same
    # assignment (quote-style reformat); if both ran, the second would win.
    input_text = request.args.get('q', '')
    input_text = request.args.get("q", "")
    if input_text:
        result = process_input(input_text)
        return render_template("result.html", result=result, input_text=input_text)
    return render_template("home.html")


def analytics(input_text):
    """Best-effort logging of a search query to the remote 'queries' table.

    Returns True when the insert succeeded, False when the remote DB is
    disabled (``REMOTE_DB`` falsy) or the insert raised. Failures are
    printed, never propagated — analytics must not break the search path.
    """
    if not REMOTE_DB:
        return False
    try:
        # NOTE(review): old/new diff residue — the same insert appears twice
        # (quote-style reformat only); as rendered this would insert the row twice.
        data, count = supabase.table('queries').insert({"query": input_text}).execute()
        data, count = supabase.table("queries").insert({"query": input_text}).execute()
    except Exception as e:
        print(f"\nError inserting record into 'queries' table: {e}\n")
        return False
    return True


def process_input(input_text):
    """Query the engine for the top 20 matches and log the query.

    Returns the engine's ranked result list of (doc, score, title) tuples.
    """
    # NOTE(review): duplicated old/new diff lines of the same call (comment
    # spacing reformat) — as rendered the engine would be queried twice.
    result = engine.query(input_text, count=20) # (doc, score, title)
    result = engine.query(input_text, count=20)  # (doc, score, title)
    analytics(input_text)
    return result


@app.route("/health", methods=["GET"])
def health_check():
    """Liveness probe: always answer HTTP 200 with a plain "OK" body."""
    return ("OK", 200)


# Script entry point: start the Flask development server.
if __name__ == "__main__":
    app.run()
    app.run()  # NOTE(review): old/new diff residue — duplicate of the line above
3 changes: 2 additions & 1 deletion src/phantom_child.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import socket
import threading
from .logger import Logger
from .utils.logger import Logger
from .phantom_engine import Parser
import time
import json
from collections import deque


class Crawler:
def __init__(self, server_host="0.0.0.0", server_port=9999):
self.server_host = server_host
Expand Down
60 changes: 3 additions & 57 deletions src/phantom_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import json
from .logger import Logger
from .utils.logger import Logger
from .utils.storage import Storage
from collections import deque
import os
from supabase import create_client, Client


class Phantom:
def __init__(
Expand Down Expand Up @@ -199,60 +199,6 @@ def url_parser(self, url):
return (title, cleaned_url)


class Storage:
    """Key/value store for crawled pages (pre-move copy, deleted in this commit).

    Writes either to a remote Supabase table (when a client can be created
    from SUPABASE_URL / SUPABASE_KEY) or to an in-memory dict dumped to
    ``<table_name>.json`` by :meth:`save`.
    """

    def __init__(self, table_name="index", resume=False, remote_db=True):
        self.table_name = table_name
        self.data = {}  # local fallback store: {url: content}

        self.resume = resume
        self.remote_db = remote_db

        # remote client set-up — any failure (including missing env vars,
        # which make create_client raise) degrades to local-only storage
        self.url = os.environ.get("SUPABASE_URL", None)
        self.key = os.environ.get("SUPABASE_KEY", None)
        try:
            self.supabase = create_client(self.url, self.key)
            if not self.supabase:
                print("Failed to connect to Supabase")
                self.remote_db = False
        except Exception as e:
            print(f"Error while creating Supabase client: {e}")
            self.remote_db = False

        print("Remote database : ", self.remote_db)
        print("DB Ready")

    def add(self, key, value, title=None):
        """Store one crawled page under its URL.

        Remote path returns True/False; local path stores and returns None.
        """
        if self.remote_db:
            try:
                data, count = self.supabase.table(self.table_name).insert({"url": key, "content": json.dumps(value), "title": title}).execute()
            except Exception as e:
                print(f"\nError inserting record into {self.table_name} table: {e}\n")
                return False
            return True

        # print("value is of length : ", len(value))
        self.data[key] = value

    def fetch_visited(self):
        """Return the set of already-stored URLs when resuming with a remote DB."""
        visited = set()
        if self.resume and self.remote_db:
            # if resume the execution and remote db available
            # NOTE(review): supabase-py v1+ exposes rows via ``response.data``;
            # this subscript access may raise — verify against the installed version.
            response = self.supabase.table('index').select('url').execute()
            for row in response['data']:
                visited.add(row['url'])
            print("Visited URLs fetched from remote DB : ",len(visited))

        return visited

    def save(self):
        """Dump the local store to ``<table_name>.json``; no-op when remote."""
        if self.remote_db:
            return
        table_name = self.table_name + ".json"
        with open(table_name, "w") as f:
            json.dump(self.data, f)


# phantom = Phantom(num_threads=8,urls=["https://github.com/AnsahMohammad"], show_logs=True, print_logs=True)
# phantom.run()
# time.sleep(30)
Expand Down
7 changes: 3 additions & 4 deletions src/phantom_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from .logger import Logger
from .utils.logger import Logger
import os
from supabase import create_client, Client

Expand Down Expand Up @@ -34,7 +34,6 @@ def __init__(self, filename="index.json", out="indexed.json") -> None:
self.idf = {}
self.tfidf = {}


def calculate_tf(self):
self.log("Calculating TF", "Phantom-Indexer")
for doc, text in self.data.items():
Expand Down Expand Up @@ -98,7 +97,7 @@ def check_remote(self):
except Exception as e:
print(f"Error while creating Supabase client: {e}")
remote_db = False

print("Remote database : ", remote_db)
print("DB Ready")
return remote_db
Expand Down Expand Up @@ -126,11 +125,11 @@ def load(self):
return False
return True


else:
self.log("Loading data from local file")
with open(self.in_file, "r") as f:
self.data = json.load(f)


def save(self):
# data = {"tfidf": self.tfidf, "idf": self.idf, "tf": self.tf}
Expand Down
2 changes: 1 addition & 1 deletion src/phantom_master.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import socket
import threading
from .logger import Logger
from .utils.logger import Logger
import os
import json

Expand Down
12 changes: 5 additions & 7 deletions src/query_engine.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import json
from collections import Counter
from .logger import Logger
from .utils.logger import Logger
import os
from supabase import create_client, Client



class Phantom_Query:
def __init__(self, filename="indexed.json", title_path=None):

Expand All @@ -14,7 +13,7 @@ def __init__(self, filename="indexed.json", title_path=None):
self.remote_db = self.check_remote()
self.logger = Logger(self.showlogs)
self.log = self.logger.log

self.data = {}
with open(filename, "r") as f:
self.data = json.load(f)
Expand All @@ -31,7 +30,6 @@ def __init__(self, filename="indexed.json", title_path=None):
self.idf = self.data["idf"]
self.tfidf = self.data["tfidf"]


self.lookup = set(self.idf.keys())
self.log("Query Engine Ready", "Query_Engine")

Expand Down Expand Up @@ -66,7 +64,7 @@ def run(self):
while True:
query = input("Enter the query : ")
print(self.query(query))

def check_remote(self):
remote_db = True

Expand All @@ -80,11 +78,11 @@ def check_remote(self):
except Exception as e:
print(f"Error while creating Supabase client: {e}")
remote_db = False

print("Remote database : ", remote_db)
print("DB Ready")
return remote_db

def load_titles(self):
# load the titles from index.json
if self.remote_db:
Expand Down
Empty file added src/utils/__init__.py
Empty file.
File renamed without changes.
61 changes: 61 additions & 0 deletions src/utils/storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os
from supabase import create_client, Client
import json


class Storage:
    """Key/value store for crawled pages.

    Writes either to a remote Supabase table (when a client can be created
    from the SUPABASE_URL / SUPABASE_KEY environment variables) or to an
    in-memory dict that :meth:`save` dumps to ``<table_name>.json``.
    """

    def __init__(self, table_name="index", resume=False, remote_db=True):
        self.table_name = table_name
        self.data = {}  # local fallback store: {url: content}

        self.resume = resume
        self.remote_db = remote_db

        # Remote client set-up — any failure (including missing env vars,
        # which make create_client raise) degrades to local-only storage.
        self.url = os.environ.get("SUPABASE_URL", None)
        self.key = os.environ.get("SUPABASE_KEY", None)
        try:
            self.supabase = create_client(self.url, self.key)
            if not self.supabase:
                print("Failed to connect to Supabase")
                self.remote_db = False
        except Exception as e:
            print(f"Error while creating Supabase client: {e}")
            self.remote_db = False

        print("Remote database : ", self.remote_db)
        print("DB Ready")

    def add(self, key, value, title=None):
        """Store one crawled page under its URL.

        Returns True on success, False when the remote insert raised.
        """
        if self.remote_db:
            try:
                data, count = (
                    self.supabase.table(self.table_name)
                    .insert({"url": key, "content": json.dumps(value), "title": title})
                    .execute()
                )
            except Exception as e:
                print(f"\nError inserting record into {self.table_name} table: {e}\n")
                return False
            return True

        self.data[key] = value
        # Fix: the local path previously fell through returning None while the
        # remote path returned a bool — report success consistently.
        return True

    def fetch_visited(self):
        """Return the set of already-stored URLs when resuming with a remote DB."""
        visited = set()
        if self.resume and self.remote_db:
            # Resuming with a remote DB available: pull previously crawled URLs.
            # NOTE(review): queries the hard-coded "index" table rather than
            # self.table_name — confirm against callers before unifying.
            response = self.supabase.table("index").select("url").execute()
            # Fix: supabase-py's execute() returns an APIResponse; the rows
            # live on the `.data` attribute — the old `response["data"]`
            # subscript raises TypeError (APIResponse is not subscriptable).
            for row in response.data:
                visited.add(row["url"])
            print("Visited URLs fetched from remote DB : ", len(visited))

        return visited

    def save(self):
        """Dump the local store to ``<table_name>.json``; no-op when remote."""
        if self.remote_db:
            return
        table_name = self.table_name + ".json"
        with open(table_name, "w") as f:
            json.dump(self.data, f)

0 comments on commit 2b6407d

Please sign in to comment.