Commit be9b370
added pdf-to-text extraction module in the scrapemore pipeline
sigdelsanjog committed Nov 26, 2024
1 parent fe4b2e2 commit be9b370
Showing 6 changed files with 135 additions and 40 deletions.
8 changes: 8 additions & 0 deletions app/config/file_utils.py
@@ -0,0 +1,8 @@
import os

temp = 'temp'

def create_temp_folder(directory=temp):
    """Check if the temp directory exists, create it if not."""
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory
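
For context, a minimal sketch of this helper in use; the import path assumes the app/ directory is on sys.path, as main.py arranges below:

from config.file_utils import create_temp_folder

# Creates ./temp on the first call; later calls simply return the path.
path = create_temp_folder()
print(path)  # 'temp'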
110 changes: 71 additions & 39 deletions app/main.py
@@ -1,21 +1,28 @@
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse
-from app.config.models import URLRequest, URLResponse, URLListRequest
-from app.scraper import fetch_links, write_links_to_csv, extract_unique_categories, extract_unique_pages, extract_unique_tags
+from fastapi.responses import FileResponse, JSONResponse
+from config.models import URLRequest, URLResponse, URLListRequest
+from scraper import fetch_links, write_links_to_csv, extract_unique_categories, extract_unique_pages, extract_unique_tags
import logging
import sys
import os
import json
import asyncio
import aiohttp
from aiohttp import ClientError, ClientTimeout

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from packages.pdfextract.routes import router as pdf_router

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

app.include_router(pdf_router, prefix="/pdfextract", tags=["PDF Extraction"])


# Configure CORS
app.add_middleware(
    CORSMiddleware,
@@ -28,91 +35,126 @@
# Define constants
TIMEOUT = 10 # Timeout for each URL request in seconds

@app.post("/scrape-unique-links-in-categories/", response_model=dict)
async def scrape_unique_links_in_categories(request: URLListRequest, background_tasks: BackgroundTasks):
"""
Fetch all URLs from each category page and then scrape their contents concurrently.
"""
try:
if not request.urls:
raise ValueError("No category URLs provided in the request.")
logger.info(f"Scraping the following categories: {request.urls}")

categories_with_links = {}
for category_url in request.urls:
logger.info(f"Fetching links from category URL: {category_url}")

# Fetch all links within the category page only once
category_links = await fetch_links(category_url, set())
logger.info(f"Found {len(category_links)} links in {category_url}")

# Enumerate the links found
urls_to_scrape = [link['link'] for link in category_links if isinstance(link, dict) and 'link' in link]
logger.info(f"URLs to scrape from {category_url}: {urls_to_scrape}")

if not urls_to_scrape:
logger.warning(f"No URLs found to scrape for category: {category_url}")

# Scrape contents of all URLs found within the category concurrently
scraped_contents = await asyncio.gather(
*[scrape_single_url(url) for url in urls_to_scrape],
return_exceptions=True # Capture errors individually
)

# Store results in the dictionary
categories_with_links[category_url] = {}
for result in scraped_contents:
if isinstance(result, tuple) and len(result) == 2:
url, content = result
categories_with_links[category_url][url] = content
logger.info(f"Scraped content from {url}")
elif isinstance(result, Exception):
logger.error(f"Error scraping URL: {result}")

# Check if any content was scraped successfully
if not categories_with_links:
raise ValueError("No content could be scraped from the provided URLs.")

# Save the scraped contents into a JSON file
json_file_path = "output.json"
with open(json_file_path, "w") as json_file:
json.dump(categories_with_links, json_file, indent=4)

# Clean up the JSON file after the response
background_tasks.add_task(clean_up_file, json_file_path)
return FileResponse(
path=json_file_path, filename="output.json", media_type="application/json"
)
except Exception as e:
logger.error(f"Error scraping unique links in categories: {e}")
raise HTTPException(status_code=500, detail="Failed to scrape unique links in categories.")

@app.post("/analyze", response_model=URLResponse)
async def analyze_url(request: URLRequest):
try:
visited_links = set()
all_links = await fetch_links(request.url, visited_links)

# Categorize the links into Pages, Tags, and Categories
pages = extract_unique_pages(all_links)
tags = extract_unique_tags(all_links)
categories = extract_unique_categories(all_links)

# Prepare a flat list of URLs for response
response_urls = []

# Flatten the categories, pages, and tags into individual URL entries
for page in pages:
response_urls.append({"category": "Page", "url": page['link']})

for tag in tags:
response_urls.append({"category": tag['category'], "url": tag['link']})

for category in categories:
response_urls.append({"category": category['category'], "url": category['link']})

# Log the prepared response
logger.info(f"Prepared response: {response_urls}")

return {"urls": response_urls}

except Exception as e:
logger.error(f"Error in /analyze: {e}")
raise HTTPException(status_code=500, detail=str(e))


@app.post("/scrape-links/")
async def scrape_links(request: URLListRequest, background_tasks: BackgroundTasks):
"""
Fetch and scrape links based on the provided list of URLs.
"""
try:
# Extract URLs from the URLListRequest model
urls = request.urls
visited_links = set()

# Log the URLs to be scraped
logger.info(f"Scraping the following URLs: {urls}")

# Fetch links for each URL
all_links = []
for url in urls:
links = await fetch_links(url, visited_links)
all_links.extend(links)

# Write the links to a CSV file
csv_file_path = await write_links_to_csv(all_links)

# Return the CSV file as a downloadable response
background_tasks.add_task(clean_up_file, csv_file_path)
return FileResponse(path=csv_file_path, filename="unique_links.csv", media_type="text/csv")

except Exception as e:
logger.error(f"Error scraping links: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error scraping links: {str(e)}")


@app.post("/scrape-all-urls/")
async def scrape_url_content(request: URLListRequest, background_tasks: BackgroundTasks):
async def scrape_all_urls(request: URLListRequest, background_tasks: BackgroundTasks):
"""
Scrape the contents of multiple URLs concurrently and save them into a JSON file.
"""
try:
# Check if URLs are provided in the request
if not request.urls:
raise ValueError("No URLs provided in the request.")

# Log the list of URLs to be scraped
logger.info(f"Starting to scrape the following URLs: {request.urls}")

# Run scraping tasks concurrently using asyncio.gather
results = await asyncio.gather(
*[scrape_single_url(url) for url in request.urls],
return_exceptions=True # This allows capturing errors individually
return_exceptions=True
)

# Create a dictionary with URL contents, filtering out failed scrapes
url_contents = {}
for result in results:
if isinstance(result, tuple) and len(result) == 2:
@@ -121,26 +163,21 @@ async def scrape_url_content(request: URLListRequest, background_tasks: BackgroundTasks):
            elif isinstance(result, Exception):
                logger.error(f"Error during scraping: {result}")

        # Check if any results were scraped successfully
        if not url_contents:
            raise ValueError("No content could be scraped from the provided URLs.")

        # Save the scraped contents into a JSON file
        json_file_path = "output.json"
        with open(json_file_path, "w") as json_file:
            json.dump(url_contents, json_file, indent=4)

        # Clean up the JSON file after the response
        background_tasks.add_task(clean_up_file, json_file_path)
        return FileResponse(
            path=json_file_path, filename="output.json", media_type="application/json"
        )

    except Exception as e:
        logger.error(f"Error scraping URLs: {e}")
        raise HTTPException(status_code=500, detail="Failed to scrape URLs. Please try again.")


async def scrape_single_url(url):
    """
    Scrape a single URL's content using aiohttp.
@@ -153,26 +190,21 @@ async def scrape_single_url(url):
error_message = f"Failed to scrape {url}, status code: {response.status}"
logger.error(error_message)
return url, error_message

content = await response.text()
logger.info(f"Successfully scraped {url}")
return url, content

except ClientError as e:
error_message = f"Network error scraping {url}: {e}"
logger.error(error_message)
return url, error_message

except asyncio.TimeoutError:
error_message = f"Timeout error scraping {url}"
logger.error(error_message)
return url, error_message

except Exception as e:
logger.error(f"Unexpected error scraping {url}: {e}")
return url, f"Error: {str(e)}"


async def clean_up_file(filepath: str):
    """
    Delete a file after use to clean up resources.
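For reference, a minimal client sketch for the /scrape-all-urls/ endpoint. The host, port, and example URLs are assumptions; URLListRequest is assumed to accept a plain list in a `urls` field, since the handlers above read `request.urls`:

import requests  # assumes the `requests` package is installed

# Hypothetical local server, e.g. started with: uvicorn main:app --reload
resp = requests.post(
    "http://127.0.0.1:8000/scrape-all-urls/",
    json={"urls": ["https://example.com", "https://example.org"]},
)
resp.raise_for_status()

# The endpoint returns output.json with per-URL page contents.
with open("output.json", "wb") as f:
    f.write(resp.content)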
2 changes: 1 addition & 1 deletion app/scraper.py
@@ -7,7 +7,7 @@
import csv
from pathlib import Path

-from app.config.driver import get_chrome_driver
+from config.driver import get_chrome_driver

# Configure logging
logging.basicConfig(level=logging.INFO)
Empty file added packages/pdfextract/__init__.py
31 changes: 31 additions & 0 deletions packages/pdfextract/routes.py
@@ -0,0 +1,31 @@
from fastapi import APIRouter, UploadFile, File, HTTPException
from pathlib import Path
import shutil
from config.file_utils import create_temp_folder  # Helper that creates the directory if missing
from .services import extract_text_and_images

router = APIRouter()

@router.post("/extract")
async def upload_pdf(file: UploadFile = File(...)):
    if not file.filename.endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed.")

    # Create the temporary folder if it doesn't exist
    temp_dir = create_temp_folder('temp')

    # Define the full path to store the uploaded file
    temp_file_path = Path(temp_dir) / file.filename

    # Save the file to the temp directory
    with open(temp_file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    try:
        # Extract text and images from the uploaded PDF
        result = extract_text_and_images(str(temp_file_path))
        return result
    finally:
        # Clean up: delete the uploaded file after processing
        if temp_file_path.exists():
            temp_file_path.unlink(missing_ok=True)
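
A minimal sketch of calling this route once the router is mounted in main.py; the host, port, and file name are assumptions:

import requests  # assumes the `requests` package is installed

# Hypothetical: a local PDF named sample.pdf and the app served on port 8000
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:8000/pdfextract/extract",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
resp.raise_for_status()
print(resp.json()["text"][:200])  # first 200 characters of extracted text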
24 changes: 24 additions & 0 deletions packages/pdfextract/services.py
@@ -0,0 +1,24 @@
from PyPDF2 import PdfReader
from pathlib import Path
from PIL import Image
import io

def extract_text_and_images(pdf_path: str):
    result = {"text": "", "images": []}
    reader = PdfReader(pdf_path)

    # Extract text
    for page in reader.pages:
        result["text"] += page.extract_text() or ""

    # Extract images
    output_dir = Path("extracted_images")
    output_dir.mkdir(exist_ok=True)  # Image.save fails if the target folder is missing
    for page in reader.pages:
        if hasattr(page, 'images'):
            for image_file in page.images:
                img_data = io.BytesIO(image_file.data)
                img = Image.open(img_data)
                img_path = output_dir / image_file.name
                img.save(img_path)
                result["images"].append(str(img_path))

    return result
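
And a quick local sketch of the service function on its own; the input file name is an assumption:

from packages.pdfextract.services import extract_text_and_images

# Hypothetical input file; extracted images land in ./extracted_images/
result = extract_text_and_images("sample.pdf")
print(len(result["text"]), "characters of text")
print(result["images"])  # paths of any extracted images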
