refactor: Code organization and Pagination (#12)
* feat: changed the loader to load the whole dataset rather than only 1000 rows from Supabase

* refactor: moved master-child to distrib

* refactor: rename src to phantom

* refactor: updated the crawl.yaml

* refactor: moved parser from engine to utils

* chore: final comments

* docs: updated paths
AnsahMohammad authored May 8, 2024
1 parent f7d6a3c commit 6a0f991
Showing 19 changed files with 88 additions and 77 deletions.
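
The headline change (first bullet of the commit message) replaces the single Supabase select, which silently capped the loader at 1000 rows, with a paging loop over the "index" table. Below is a minimal standalone sketch of that loop, distilled from the phantom_indexing.py and query_engine.py hunks further down (the SUPABASE_URL/SUPABASE_KEY variables and the "index" table come from the repo; everything else is illustrative):

import json
import os

from supabase import create_client  # same client phantom.py already uses

# Hypothetical standalone version of the new loader logic: pull the "index"
# table in 1000-row windows via .range() until an empty page comes back.
supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])

data = {}
start, end = 0, 999
while True:
    response = supabase.table("index").select("url", "content").range(start, end).execute()
    if not response.data:
        break
    for record in response.data:
        data[record["url"]] = json.loads(record["content"])
    start += 1000
    end += 1000

print(f"Loaded {len(data)} rows from Supabase")
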
4 changes: 2 additions & 2 deletions .github/workflows/crawl.yaml
@@ -24,12 +24,12 @@ jobs:
- name: Test crawler and indexer
run: |
source .env/bin/activate
python3 -m src.phantom --num_threads 2 --urls "https://github.com/AnsahMohammad" "https://github.com/AnsahMohammad" --sleep 6
python3 -m phantom.phantom --num_threads 2 --urls "https://github.com/AnsahMohammad" "https://github.com/AnsahMohammad" --sleep 6
python3 -m nltk.downloader stopwords
python3 -m nltk.downloader punkt
python3 -m src.phantom_indexing
python3 -m phantom.phantom_indexing
echo "Crawling completed"
4 changes: 2 additions & 2 deletions DOCUMENTATION.md
@@ -24,7 +24,7 @@ There are two ways you can crawl the websites to save the indexes

## 1) Multithreaded Crawlers

The multithreaded crawler is implemented in the `Phantom` class in the `src/phantom.py` file. It uses multiple threads to crawl websites concurrently, which significantly speeds up the crawling process.
The multithreaded crawler is implemented in the `Phantom` class in the `phantom/phantom.py` file. It uses multiple threads to crawl websites concurrently, which significantly speeds up the crawling process.

Here's a brief overview of how it works:

@@ -40,7 +40,7 @@ Here's a brief overview of how it works:

- The `stop` method can be used to stop the crawling process. It sets a `kill` flag that causes the `crawler` methods to stop, waits for all threads to finish, and then saves the crawled data and prints some statistics.

You can start the program by running the script on `src/phantom.py`. It uses `phantom_engine.py` to crawl the sites using multiple threads.
You can start the program by running the script on `phantom/phantom.py`. It uses `phantom_engine.py` to crawl the sites using multiple threads.


## 2) Distributed Crawler system
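
The kill-flag shutdown described in the DOCUMENTATION.md excerpt above boils down to a standard threading pattern. A minimal sketch with illustrative names (only `kill`, `crawler`, and `stop` come from the documentation; the rest is assumed and is not the project's actual Phantom class):

import threading
import time


class CrawlerSketch:
    # Illustrative stand-in for the Phantom class described above.
    def __init__(self, num_threads=2):
        self.kill = False  # flag that stop() sets to ask every worker to exit
        self.threads = [threading.Thread(target=self.crawler) for _ in range(num_threads)]

    def crawler(self):
        while not self.kill:
            time.sleep(0.1)  # placeholder for fetching and parsing a queued URL

    def run(self):
        for t in self.threads:
            t.start()

    def stop(self):
        self.kill = True       # signal every crawler loop to stop
        for t in self.threads:
            t.join()           # wait for all threads to finish
        print("crawled data saved, statistics printed")  # placeholder for save + stats
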
4 changes: 2 additions & 2 deletions build.sh
@@ -10,7 +10,7 @@ pip install -r requirements.txt
if [ -z "$SUPABASE_URL" ] || [ -z "$SUPABASE_KEY" ]; then
sleep 1
echo "SUPABASE_URL and SUPABASE_KEY are not set. Crawling..."
python3 -m src.phantom --num_threads 5 --urls "https://www.geeksforgeeks.org/" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" "https://bloggingidol.com/best-programming-blogs/" "https://www.hindustantimes.com/india-news/" "https://www.bbc.com/news" --show_logs True --print_logs True --sleep 240
python3 -m phantom.phantom --num_threads 5 --urls "https://www.geeksforgeeks.org/" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" "https://bloggingidol.com/best-programming-blogs/" "https://www.hindustantimes.com/india-news/" "https://www.bbc.com/news" --show_logs True --print_logs True --sleep 240
else
echo "SUPABASE_URL and SUPABASE_KEY are set. Not crawling."
fi
@@ -20,7 +20,7 @@ echo "Installation done"
python3 -m nltk.downloader stopwords
python3 -m nltk.downloader punkt
ls
python3 -m src.phantom_indexing
python3 -m phantom.phantom_indexing
echo "Phantom Processing done"
clear
echo "Build done"
2 changes: 1 addition & 1 deletion crawl.sh
@@ -4,4 +4,4 @@ python3 -m venv .env
source .env/bin/activate

pip install -r requirements.txt
python3 -m src.phantom --num_threads 10 --urls "https://www.geeksforgeeks.org/" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" "https://bloggingidol.com/best-programming-blogs/" "https://www.hindustantimes.com/india-news/" "https://www.bbc.com/news" --show_logs True --print_logs True --sleep 1800
python3 -m phantom.phantom --num_threads 10 --urls "https://www.geeksforgeeks.org/" "https://en.wikipedia.org/wiki/India" "https://developers.cloudflare.com/" "https://bloggingidol.com/best-programming-blogs/" "https://www.hindustantimes.com/india-news/" "https://www.bbc.com/news" --show_logs True --print_logs True --sleep 1800
4 changes: 2 additions & 2 deletions local_search.sh
@@ -5,7 +5,7 @@ source .env/bin/activate
pip install -r requirements.txt
clear
echo "Installation done"
python3 -m src.phantom_indexing
python3 -m phantom.phantom_indexing
echo "Phantom Indexing done"
clear
python3 -m src.query_engine
python3 -m phantom.query_engine
4 changes: 2 additions & 2 deletions phantom.py
@@ -1,7 +1,7 @@
import os
from flask import Flask, render_template, request, redirect, url_for
from src.query_engine import Phantom_Query
from src.phantom_engine import Parser
from phantom.query_engine import Phantom_Query
from phantom.phantom_engine import Parser
from supabase import create_client, Client

# setting up database
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions src/phantom_child.py → phantom/distrib/child.py
@@ -1,7 +1,7 @@
import socket
import threading
from .utils.logger import Logger
from .phantom_engine import Parser
from ..utils.logger import Logger
from ..phantom_engine import Parser
import time
import json
from collections import deque
2 changes: 1 addition & 1 deletion src/phantom_master.py → phantom/distrib/master.py
@@ -1,6 +1,6 @@
import socket
import threading
from .utils.logger import Logger
from ..utils.logger import Logger
import os
import json

File renamed without changes.
47 changes: 1 addition & 46 deletions src/phantom_engine.py → phantom/phantom_engine.py
@@ -1,13 +1,10 @@
import threading
import time
import random
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import json
from .utils.logger import Logger
from .utils.storage import Storage
from collections import deque
from .utils.parser import Parser


class Phantom:
@@ -157,48 +154,6 @@ def end(self):
        self.id_root.clear()
        print("Phantom Crawler Ended")


class Parser:
    def __init__(self, show_logs=True):
        self.show_logs = show_logs
        self.log = Logger(self.show_logs).log

    def clean_url(self, url):
        parsed = urlparse(url)
        cleaned = parsed.scheme + "://" + parsed.netloc + parsed.path
        return cleaned

    def fetch(self, url):
        response = requests.get(url)
        return response.content

    def parse(self, url):
        self.log(f"parsing {url}", "Parser")

        # cleaned_url = self.clean_url(url) since already cleaned disabled
        content = self.fetch(url)

        soup = BeautifulSoup(content, "html.parser")

        title = soup.title.string if soup.title else None

        text = soup.get_text()
        words = text.split()
        links = [urljoin(url, link.get("href")) for link in soup.find_all("a")]

        return links, words, url, title

    def url_parser(self, url):
        self.log(f"parsing {url}", "Parser")

        cleaned_url = self.clean_url(url)
        content = self.fetch(cleaned_url)

        soup = BeautifulSoup(content, "html.parser")
        title = soup.title.string
        return (title, cleaned_url)


# phantom = Phantom(num_threads=8,urls=["https://github.com/AnsahMohammad"], show_logs=True, print_logs=True)
# phantom.run()
# time.sleep(30)
24 changes: 14 additions & 10 deletions src/phantom_indexing.py → phantom/phantom_indexing.py
@@ -12,7 +12,6 @@
# nltk.download('punkt')
# nltk.download('stopwords')


class PhantomIndexer:
    def __init__(self, filename="index.json", out="indexed.json") -> None:
        self.out_file = out
@@ -107,20 +106,25 @@ def load(self):
        if self.remote_db:
            try:
                self.log("Fetching data from remote DB")
                response = (
                    self.supabase.table("index").select("url", "content").execute()
                )
                for record in response.data:
                    self.data[record["url"]] = json.loads(record["content"])
                self.log(
                    f"Data fetched from remote DB: {len(self.data)}",
                    "Phantom-Indexer-Loader",
                )
                start = 0
                end = 999
                while True:
                    response = self.supabase.table("index").select("url", "content").range(start, end).execute()
                    if not response.data:
                        break
                    for record in response.data:
                        self.data[record["url"]] = json.loads(record["content"])
                    start += 1000
                    end += 1000
                    self.log(f"Data fetched from remote DB: {len(self.titles)}", "Phantom-Indexer-Loader")

                self.log(f"Data fetched from remote DB: {len(self.data)}", "Phantom-Indexer-Loader")
            except Exception as e:
                print(f"\nError fetching data from index table: {e}\n")
                return False
            return True


        else:
            self.log("Loading data from local file")
            with open(self.in_file, "r") as f:
19 changes: 12 additions & 7 deletions src/query_engine.py → phantom/query_engine.py
@@ -88,13 +88,18 @@ def load_titles(self):
        if self.remote_db:
            try:
                self.log("Fetching data from remote DB")
                response = self.supabase.table("index").select("url", "title").execute()
                for record in response.data:
                    self.titles[record["url"]] = record["title"]
                self.log(
                    f"Data fetched from remote DB: {len(self.titles)}",
                    "Phantom-Indexer-Loader",
                )
                start = 0
                end = 999
                while True:
                    response = self.supabase.table("index").select("url", "title").range(start, end).execute()
                    if not response.data:
                        break
                    for record in response.data:
                        self.titles[record["url"]] = record["title"]
                    start += 1000
                    end += 1000
                    self.log(f"Data fetched from remote DB: {len(self.titles)}", "Phantom-Indexer-Loader")
                self.log(f"Data fetched from remote DB: {len(self.titles)}", "Phantom-Indexer-Loader")
            except Exception as e:
                print(f"\nError fetching data from index table: {e}\n")
                return False
File renamed without changes.
Empty file added phantom/utils/__init__.py
Empty file.
File renamed without changes.
46 changes: 46 additions & 0 deletions phantom/utils/parser.py
@@ -0,0 +1,46 @@
from bs4 import BeautifulSoup
from .logger import Logger
from urllib.parse import urlparse, urljoin
import json
import requests


class Parser:
    def __init__(self, show_logs=True):
        self.show_logs = show_logs
        self.log = Logger(self.show_logs).log

    def clean_url(self, url):
        parsed = urlparse(url)
        cleaned = parsed.scheme + "://" + parsed.netloc + parsed.path
        return cleaned

    def fetch(self, url):
        response = requests.get(url)
        return response.content

    def parse(self, url):
        self.log(f"parsing {url}", "Parser")

        # cleaned_url = self.clean_url(url) since already cleaned disabled
        content = self.fetch(url)

        soup = BeautifulSoup(content, "html.parser")

        title = soup.title.string if soup.title else None

        text = soup.get_text()
        words = text.split()
        links = [urljoin(url, link.get("href")) for link in soup.find_all("a")]

        return links, words, url, title

    def url_parser(self, url):
        self.log(f"parsing {url}", "Parser")

        cleaned_url = self.clean_url(url)
        content = self.fetch(cleaned_url)

        soup = BeautifulSoup(content, "html.parser")
        title = soup.title.string
        return (title, cleaned_url)
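
For reference, a hedged usage sketch of the relocated parser (the import path follows the new phantom/utils/parser.py location; the seed URL is one already used in crawl.yaml):

from phantom.utils.parser import Parser

parser = Parser(show_logs=False)
links, words, url, title = parser.parse("https://github.com/AnsahMohammad")
print(title, len(links), len(words))
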
1 change: 1 addition & 0 deletions src/utils/storage.py → phantom/utils/storage.py
@@ -27,6 +27,7 @@ def __init__(self, table_name="index", resume=False, remote_db=True):
print("DB Ready")

def add(self, key, value, title=None):
# TODO: Genaralize the function to accept any table name
if self.remote_db:
try:
data, count = (
