Skip to content

Commit

Permalink
feat(api): Support Uploading PowerPoint Files for RAG (#733)
Browse files Browse the repository at this point in the history
* Adds PowerPoint (ppt and pptx) support to the files endpoint
  • Loading branch information
CollectiveUnicorn authored Jul 10, 2024
1 parent d1e42d9 commit 612126d
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 2 deletions.
5 changes: 5 additions & 0 deletions src/leapfrogai_api/backend/rag/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
TextLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
UnstructuredPowerPointLoader,
UnstructuredExcelLoader,
)
from langchain_core.documents import Document
Expand All @@ -24,6 +25,8 @@
"text/markdown": UnstructuredMarkdownLoader,
"application/msword": Docx2txtLoader,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": Docx2txtLoader,
"application/vnd.openxmlformats-officedocument.presentationml.presentation": UnstructuredPowerPointLoader,
"application/vnd.ms-powerpoint": UnstructuredPowerPointLoader,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": UnstructuredExcelLoader,
"xls:application/vnd.ms-excel": UnstructuredExcelLoader,
}
Expand All @@ -40,6 +43,8 @@
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
".xls": "xls:application/vnd.ms-excel",
".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
".ppt": "application/vnd.ms-powerpoint",
}


Expand Down
2 changes: 1 addition & 1 deletion src/leapfrogai_api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies = [
"supabase-py-async >= 2.5.6",
"langchain >= 0.2.1",
"langchain-community >= 0.2.1",
"unstructured[md,xlsx] >= 0.14.2", # Only specify necessary filetypes to prevent package bloat (e.g. 130MB vs 6GB)
"unstructured[md,xlsx,pptx] >= 0.14.2", # Only specify necessary filetypes to prevent package bloat (e.g. 130MB vs 6GB)
"pylibmagic >= 0.5.0", # Resolves issue with libmagic not being bundled with OS - https://github.com/ahupp/python-magic/issues/233, may not be needed after this is merged https://github.com/ahupp/python-magic/pull/294
"python-magic >= 0.4.27",
"openpyxl >= 3.1.5",
Expand Down
Binary file added tests/data/test.pptx
Binary file not shown.
80 changes: 79 additions & 1 deletion tests/integration/api/test_files.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Test the API endpoints for files."""

import os

import pytest
from fastapi import HTTPException, Response, status
from fastapi.testclient import TestClient
Expand Down Expand Up @@ -224,3 +223,82 @@ async def test_excel_file_handling():
get_deleted_response = client.get(f"/openai/v1/files/{file_object.id}")
assert get_deleted_response.status_code == status.HTTP_200_OK
assert get_deleted_response.json() is None, "Deleted file should not be retrievable"


@pytest.mark.asyncio
async def test_powerpoint_file_handling():
    """Walk a PowerPoint fixture through load, split, upload, fetch, and delete.

    The fixture is first parsed and chunked directly, then pushed through the
    files endpoint. Its metadata and raw content are read back, the file is
    deleted, and a final lookup confirms that nothing is returned for it.
    """
    pptx_mime = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    )

    # Resolve the fixture relative to this test module and fail fast if absent.
    this_dir = os.path.dirname(__file__)
    pptx_file_path = os.path.join(this_dir, "../../data/test.pptx")
    fixture_present = os.path.exists(pptx_file_path)
    assert fixture_present, f"Test PowerPoint file not found at {pptx_file_path}"

    # Direct loader check: the deck must yield at least one non-empty document.
    documents = await load_file(pptx_file_path)
    loaded_count = len(documents)
    assert loaded_count > 0, "No documents were loaded from the PowerPoint file"
    first_document = documents[0]
    assert first_document.page_content, "The first document has no content"

    # Splitting may only keep or increase the number of documents.
    split_documents = await split(documents)
    split_count = len(split_documents)
    assert split_count >= loaded_count, "Documents were not split properly"
    first_chunk = split_documents[0]
    assert first_chunk.page_content, "The first split document has no content"

    # Upload the same fixture through the API as a multipart form.
    with open(pptx_file_path, "rb") as pptx_file:
        upload_parts = {"file": ("test.pptx", pptx_file, pptx_mime)}
        form_fields = {"purpose": "assistants"}
        response = client.post(
            "/openai/v1/files",
            files=upload_parts,
            data=form_fields,
        )

    upload_ok = response.status_code == status.HTTP_200_OK
    assert upload_ok, f"Failed to upload PowerPoint file: {response.text}"
    file_object = FileObject.model_validate(response.json())
    file_url = f"/openai/v1/files/{file_object.id}"

    # Metadata lookup must return the object that was just created.
    get_response = client.get(file_url)
    fetch_ok = get_response.status_code == status.HTTP_200_OK
    assert fetch_ok, f"Failed to retrieve file: {get_response.text}"
    retrieved_file = FileObject.model_validate(get_response.json())
    same_id = retrieved_file.id == file_object.id
    assert same_id, "Retrieved file ID doesn't match uploaded file ID"

    # Content download: status, headers, and a non-empty payload.
    content_response = client.get(f"/openai/v1/files/{file_object.id}/content")
    content_ok = content_response.status_code == status.HTTP_200_OK
    assert content_ok, f"Failed to retrieve file content: {content_response.text}"
    response_headers = content_response.headers
    assert response_headers["Content-Type"] == pptx_mime
    expected_disposition = f'attachment; filename="{file_object.filename}"'
    assert response_headers["Content-Disposition"] == expected_disposition
    assert len(content_response.content) > 0, "File content is empty"

    # Delete the file and check the endpoint reports the deletion.
    delete_response = client.delete(file_url)
    delete_ok = delete_response.status_code == status.HTTP_200_OK
    assert delete_ok, f"Failed to delete file: {delete_response.text}"
    deleted_flag = delete_response.json()["deleted"]
    assert deleted_flag is True, "File was not deleted successfully"

    # After deletion the lookup still answers 200 but with an empty body.
    get_deleted_response = client.get(file_url)
    assert get_deleted_response.status_code == status.HTTP_200_OK
    assert get_deleted_response.json() is None, "Deleted file should not be retrievable"

0 comments on commit 612126d

Please sign in to comment.