Commit be9b370
added pdf-to-text extraction module in the scrapemore pipeline
sigdelsanjog committed Nov 26, 2024
1 parent fe4b2e2 commit be9b370
Showing 6 changed files with 135 additions and 40 deletions.
8 changes: 8 additions & 0 deletions app/config/file_utils.py
@@ -0,0 +1,8 @@
import os

temp = 'temp'

def create_temp_folder(directory=temp):
    """Check if the temp directory exists, create it if not."""
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory
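
For context, a minimal sketch of this helper in use; the import path assumes the app/ directory is on sys.path, as main.py arranges below:

from config.file_utils import create_temp_folder

# Creates ./temp on the first call; later calls simply return the path.
path = create_temp_folder()
print(path)  # 'temp'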
110 changes: 71 additions & 39 deletions app/main.py
@@ -1,21 +1,28 @@
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse
-from app.config.models import URLRequest, URLResponse, URLListRequest
-from app.scraper import fetch_links, write_links_to_csv, extract_unique_categories, extract_unique_pages, extract_unique_tags
+from fastapi.responses import FileResponse, JSONResponse
+from config.models import URLRequest, URLResponse, URLListRequest
+from scraper import fetch_links, write_links_to_csv, extract_unique_categories, extract_unique_pages, extract_unique_tags
import logging
import sys
import os
import json
import asyncio
import aiohttp
from aiohttp import ClientError, ClientTimeout

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from packages.pdfextract.routes import router as pdf_router

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

app.include_router(pdf_router, prefix="/pdfextract", tags=["PDF Extraction"])


# Configure CORS
app.add_middleware(
    CORSMiddleware,
@@ -28,91 +35,126 @@
# Define constants
TIMEOUT = 10 # Timeout for each URL request in seconds

@app.post("/scrape-unique-links-in-categories/", response_model=dict)
async def scrape_unique_links_in_categories(request: URLListRequest, background_tasks: BackgroundTasks):
"""
Fetch all URLs from each category page and then scrape their contents concurrently.
"""
try:
if not request.urls:
raise ValueError("No category URLs provided in the request.")
logger.info(f"Scraping the following categories: {request.urls}")

categories_with_links = {}
for category_url in request.urls:
logger.info(f"Fetching links from category URL: {category_url}")

# Fetch all links within the category page only once
category_links = await fetch_links(category_url, set())
logger.info(f"Found {len(category_links)} links in {category_url}")

# Enumerate the links found
urls_to_scrape = [link['link'] for link in category_links if isinstance(link, dict) and 'link' in link]
logger.info(f"URLs to scrape from {category_url}: {urls_to_scrape}")

if not urls_to_scrape:
logger.warning(f"No URLs found to scrape for category: {category_url}")

# Scrape contents of all URLs found within the category concurrently
scraped_contents = await asyncio.gather(
*[scrape_single_url(url) for url in urls_to_scrape],
return_exceptions=True # Capture errors individually
)

# Store results in the dictionary
categories_with_links[category_url] = {}
for result in scraped_contents:
if isinstance(result, tuple) and len(result) == 2:
url, content = result
categories_with_links[category_url][url] = content
logger.info(f"Scraped content from {url}")
elif isinstance(result, Exception):
logger.error(f"Error scraping URL: {result}")

# Check if any content was scraped successfully
if not categories_with_links:
raise ValueError("No content could be scraped from the provided URLs.")

# Save the scraped contents into a JSON file
json_file_path = "output.json"
with open(json_file_path, "w") as json_file:
json.dump(categories_with_links, json_file, indent=4)

# Clean up the JSON file after the response
background_tasks.add_task(clean_up_file, json_file_path)
return FileResponse(
path=json_file_path, filename="output.json", media_type="application/json"
)
except Exception as e:
logger.error(f"Error scraping unique links in categories: {e}")
raise HTTPException(status_code=500, detail="Failed to scrape unique links in categories.")

@app.post("/analyze", response_model=URLResponse)
async def analyze_url(request: URLRequest):
try:
visited_links = set()
all_links = await fetch_links(request.url, visited_links)

# Categorize the links into Pages, Tags, and Categories
pages = extract_unique_pages(all_links)
tags = extract_unique_tags(all_links)
categories = extract_unique_categories(all_links)

# Prepare a flat list of URLs for response
response_urls = []

# Flatten the categories, pages, and tags into individual URL entries
for page in pages:
response_urls.append({"category": "Page", "url": page['link']})

for tag in tags:
response_urls.append({"category": tag['category'], "url": tag['link']})

for category in categories:
response_urls.append({"category": category['category'], "url": category['link']})

# Log the prepared response
logger.info(f"Prepared response: {response_urls}")

return {"urls": response_urls}

except Exception as e:
logger.error(f"Error in /analyze: {e}")
raise HTTPException(status_code=500, detail=str(e))


@app.post("/scrape-links/")
async def scrape_links(request: URLListRequest, background_tasks: BackgroundTasks):
"""
Fetch and scrape links based on the provided list of URLs.
"""
try:
# Extract URLs from the URLListRequest model
urls = request.urls
visited_links = set()

# Log the URLs to be scraped
logger.info(f"Scraping the following URLs: {urls}")

# Fetch links for each URL
all_links = []
for url in urls:
links = await fetch_links(url, visited_links)
all_links.extend(links)

# Write the links to a CSV file
csv_file_path = await write_links_to_csv(all_links)

# Return the CSV file as a downloadable response
background_tasks.add_task(clean_up_file, csv_file_path)
return FileResponse(path=csv_file_path, filename="unique_links.csv", media_type="text/csv")

except Exception as e:
logger.error(f"Error scraping links: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error scraping links: {str(e)}")


@app.post("/scrape-all-urls/")
async def scrape_url_content(request: URLListRequest, background_tasks: BackgroundTasks):
async def scrape_all_urls(request: URLListRequest, background_tasks: BackgroundTasks):
"""
Scrape the contents of multiple URLs concurrently and save them into a JSON file.
"""
try:
# Check if URLs are provided in the request
if not request.urls:
raise ValueError("No URLs provided in the request.")

# Log the list of URLs to be scraped
logger.info(f"Starting to scrape the following URLs: {request.urls}")

# Run scraping tasks concurrently using asyncio.gather
results = await asyncio.gather(
*[scrape_single_url(url) for url in request.urls],
return_exceptions=True # This allows capturing errors individually
return_exceptions=True
)

# Create a dictionary with URL contents, filtering out failed scrapes
url_contents = {}
for result in results:
if isinstance(result, tuple) and len(result) == 2:
@@ -121,26 +163,21 @@ async def scrape_url_content(request: URLListRequest, background_tasks: BackgroundTasks):
            elif isinstance(result, Exception):
                logger.error(f"Error during scraping: {result}")

        # Check if any results were scraped successfully
        if not url_contents:
            raise ValueError("No content could be scraped from the provided URLs.")

        # Save the scraped contents into a JSON file
        json_file_path = "output.json"
        with open(json_file_path, "w") as json_file:
            json.dump(url_contents, json_file, indent=4)

        # Clean up the JSON file after the response
        background_tasks.add_task(clean_up_file, json_file_path)
        return FileResponse(
            path=json_file_path, filename="output.json", media_type="application/json"
        )

    except Exception as e:
        logger.error(f"Error scraping URLs: {e}")
        raise HTTPException(status_code=500, detail="Failed to scrape URLs. Please try again.")


async def scrape_single_url(url):
    """
    Scrape a single URL's content using aiohttp.
@@ -153,26 +190,21 @@ async def scrape_single_url(url):
error_message = f"Failed to scrape {url}, status code: {response.status}"
logger.error(error_message)
return url, error_message

content = await response.text()
logger.info(f"Successfully scraped {url}")
return url, content

except ClientError as e:
error_message = f"Network error scraping {url}: {e}"
logger.error(error_message)
return url, error_message

except asyncio.TimeoutError:
error_message = f"Timeout error scraping {url}"
logger.error(error_message)
return url, error_message

except Exception as e:
logger.error(f"Unexpected error scraping {url}: {e}")
return url, f"Error: {str(e)}"


async def clean_up_file(filepath: str):
    """
    Delete a file after use to clean up resources.
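For reference, a minimal client sketch for the /scrape-all-urls/ endpoint. The host, port, and example URLs are assumptions; URLListRequest is assumed to accept a plain list in a `urls` field, since the handlers above read `request.urls`:

import requests  # assumes the `requests` package is installed

# Hypothetical local server, e.g. started with: uvicorn main:app --reload
resp = requests.post(
    "http://127.0.0.1:8000/scrape-all-urls/",
    json={"urls": ["https://example.com", "https://example.org"]},
)
resp.raise_for_status()

# The endpoint returns output.json with per-URL page contents.
with open("output.json", "wb") as f:
    f.write(resp.content)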
2 changes: 1 addition & 1 deletion app/scraper.py
@@ -7,7 +7,7 @@
import csv
from pathlib import Path

-from app.config.driver import get_chrome_driver
+from config.driver import get_chrome_driver

# Configure logging
logging.basicConfig(level=logging.INFO)
Empty file added packages/pdfextract/__init__.py
31 changes: 31 additions & 0 deletions packages/pdfextract/routes.py
@@ -0,0 +1,31 @@
from fastapi import APIRouter, UploadFile, File, HTTPException
from pathlib import Path
import shutil
from config.file_utils import create_temp_folder  # Helper that creates the directory if missing
from .services import extract_text_and_images

router = APIRouter()

@router.post("/extract")
async def upload_pdf(file: UploadFile = File(...)):
    if not file.filename.endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed.")

    # Create the temporary folder if it doesn't exist
    temp_dir = create_temp_folder('temp')

    # Define the full path to store the uploaded file
    temp_file_path = Path(temp_dir) / file.filename

    # Save the file to the temp directory
    with open(temp_file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    try:
        # Extract text and images from the uploaded PDF
        result = extract_text_and_images(str(temp_file_path))
        return result
    finally:
        # Clean up: delete the uploaded file after processing
        if temp_file_path.exists():
            temp_file_path.unlink(missing_ok=True)
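
A minimal sketch of calling this route once the router is mounted in main.py; the host, port, and file name are assumptions:

import requests  # assumes the `requests` package is installed

# Hypothetical: a local PDF named sample.pdf and the app served on port 8000
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:8000/pdfextract/extract",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
resp.raise_for_status()
print(resp.json()["text"][:200])  # first 200 characters of extracted text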
24 changes: 24 additions & 0 deletions packages/pdfextract/services.py
@@ -0,0 +1,24 @@
from PyPDF2 import PdfReader
from pathlib import Path
from PIL import Image
import io

def extract_text_and_images(pdf_path: str):
    result = {"text": "", "images": []}
    reader = PdfReader(pdf_path)

    # Extract text
    for page in reader.pages:
        result["text"] += page.extract_text() or ""

    # Extract images
    output_dir = Path("extracted_images")
    output_dir.mkdir(exist_ok=True)  # Image.save fails if the target folder is missing
    for page in reader.pages:
        if hasattr(page, 'images'):
            for image_file in page.images:
                img_data = io.BytesIO(image_file.data)
                img = Image.open(img_data)
                img_path = output_dir / image_file.name
                img.save(img_path)
                result["images"].append(str(img_path))

    return result
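
And a quick local sketch of the service function on its own; the input file name is an assumption:

from packages.pdfextract.services import extract_text_and_images

# Hypothetical input file; extracted images land in ./extracted_images/
result = extract_text_and_images("sample.pdf")
print(len(result["text"]), "characters of text")
print(result["images"])  # paths of any extracted images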
