-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added PDF-to-text extraction module in the scrapemore pipeline
- Loading branch information
1 parent
fe4b2e2
commit be9b370
Showing
6 changed files
with
135 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
import os | ||
|
||
# Default name of the scratch directory used for uploaded files.
temp = 'temp'


def create_temp_folder(directory: str = temp) -> str:
    """Ensure *directory* exists and return it.

    The directory (and any missing parents) is created when absent. Using
    ``exist_ok=True`` instead of a separate existence check avoids a race
    where two concurrent callers both see the directory as missing and one
    of them then fails inside ``os.makedirs``.

    Args:
        directory: Path of the directory to create. Defaults to ``temp``.

    Returns:
        The same ``directory`` value that was passed in.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)
    return directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from fastapi import APIRouter, UploadFile, File, HTTPException | ||
from pathlib import Path | ||
import shutil | ||
from config.file_utils import create_temp_folder # Import your function to create the directory | ||
from .services import extract_text_and_images | ||
|
||
router = APIRouter() | ||
|
||
@router.post("/extract")
async def upload_pdf(file: UploadFile = File(...)):
    """Accept an uploaded PDF, run extraction on it, and return the result.

    The upload is written to a uniquely named file inside the ``temp``
    directory, passed to ``extract_text_and_images``, and removed again
    whether or not extraction succeeds.

    Args:
        file: The uploaded file. Its client-supplied name must end in
            ``.pdf`` (case-insensitive).

    Returns:
        Whatever ``extract_text_and_images`` returns for the stored file.

    Raises:
        HTTPException: 400 when the upload has no name or is not a ``.pdf``.
    """
    # Function-scope import so the handler does not depend on edits to the
    # module's import block.
    import tempfile

    # The filename is client-controlled and may be missing entirely.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed.")

    # Create the temporary folder if it doesn't exist
    temp_dir = create_temp_folder('temp')

    temp_file_path = None
    try:
        # Never build the path from the client-supplied name: it could contain
        # "../" or be absolute, and two concurrent uploads with the same name
        # would overwrite/delete each other's file. A generated name avoids both.
        with tempfile.NamedTemporaryFile(
            mode="wb", dir=temp_dir, suffix=".pdf", delete=False
        ) as buffer:
            temp_file_path = Path(buffer.name)
            shutil.copyfileobj(file.file, buffer)

        # NOTE(review): this is blocking work inside an async handler;
        # consider offloading it to a thread pool if uploads are large.
        return extract_text_and_images(str(temp_file_path))
    finally:
        # Clean up even when the copy or the extraction raised.
        if temp_file_path is not None:
            temp_file_path.unlink(missing_ok=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from PyPDF2 import PdfReader | ||
from pathlib import Path | ||
from PIL import Image | ||
import io | ||
|
||
def extract_text_and_images(pdf_path: str):
    """Extract the text and embedded images from a PDF file.

    Text from every page is concatenated in page order. Each embedded image
    is re-saved under the ``extracted_images`` directory, with the page
    number prefixed to its name so that identically named images on
    different pages do not overwrite one another.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A dict with two keys: ``"text"`` (the concatenated page text) and
        ``"images"`` (a list of the saved image paths as strings).
    """
    reader = PdfReader(pdf_path)

    # The output directory must exist before PIL can save into it.
    image_dir = Path("extracted_images")
    image_dir.mkdir(parents=True, exist_ok=True)

    text_parts = []
    image_paths = []

    for page_number, page in enumerate(reader.pages, start=1):
        # extract_text() may yield None/empty for pages without text.
        text_parts.append(page.extract_text() or "")

        # Older PyPDF2 releases have no ``images`` attribute on pages.
        for image_file in getattr(page, "images", ()):
            # Keep only the basename so the embedded name cannot redirect the
            # output outside image_dir.
            safe_name = Path(image_file.name).name
            img_path = image_dir / f"page{page_number}_{safe_name}"
            with Image.open(io.BytesIO(image_file.data)) as img:
                img.save(img_path)
            image_paths.append(str(img_path))

    return {"text": "".join(text_parts), "images": image_paths}