# app.py

from gpt_index import Document, GPTListIndex
import gradio as gr
import openai
import os
import PyPDF2
import docx
import pytesseract
from PIL import Image

def pdftotext(file_name):
  """
  Function to extract text from .pdf format files

  Opens file_name in binary mode, reads it with PyPDF2.PdfReader and
  returns the extracted text of all pages joined with newlines.
  """
  with open(file_name, 'rb') as handle:
    reader = PyPDF2.PdfReader(handle)
    # Extraction happens inside the with-block, while the file is open.
    page_texts = [page.extract_text() for page in reader.pages]

  return "\n".join(page_texts)

def docxtotext(file_name):
  """
  Function to read .docx format files

  Loads file_name with docx.Document and returns the text of each entry
  in document.paragraphs, joined with newlines.
  """
  paragraphs = docx.Document(file_name).paragraphs
  return '\n'.join(paragraph.text for paragraph in paragraphs)

def readtextfile(file_name):
  """
  Function to read .txt format files

  The file is always decoded as UTF-8 instead of the platform's default
  encoding, so the result does not depend on the machine the app runs on.
  Bytes that are not valid UTF-8 are replaced with U+FFFD rather than
  raising UnicodeDecodeError, so a badly encoded upload is still read on a
  best-effort basis.

  Returns the file content as a single string.
  """
  # errors='replace' keeps the read best-effort for arbitrary user uploads.
  with open(file_name, 'r', encoding='utf-8', errors='replace') as file:
    return file.read()

def imagetotext(file_name):
  """
  Function to extract text from images

  Opens file_name with PIL and runs pytesseract.image_to_string on it.
  Returns the recognised text as returned by pytesseract.
  """
  # Use the image as a context manager so the underlying file handle is
  # released as soon as OCR finishes instead of lingering until garbage
  # collection.
  with Image.open(file_name) as image:
    return pytesseract.image_to_string(image)

def preprocesstext(text):
  """
  Function to preprocess text

  Removes every line that is empty or whitespace-only and returns the
  remaining lines, unmodified, joined with newline characters.
  """
  # A whitespace-only line strips down to "", which is falsy.
  return '\n'.join(row for row in text.splitlines() if row.strip())

def processfiles(files):
  """
  Function to extract text from documents

  For each entry in files, its .name is used as the path and the text
  after the last "." (lower-cased) selects the reader. Unrecognised
  extensions contribute an empty string. Every extracted text is passed
  through preprocesstext; the results are returned as a list in input order.
  """

  def extract(path):
    # Pick the reader from the extension of the file name
    suffix = path.split(".")[-1].lower()
    if suffix == "pdf":
      return pdftotext(path)
    if suffix == "docx":
      return docxtotext(path)
    if suffix == "txt":
      return readtextfile(path)
    if suffix in ["png", "jpg", "jpeg"]:
      return imagetotext(path)
    # Unsupported format: nothing to extract
    return ""

  return [preprocesstext(extract(upload.name)) for upload in files]

def createdocuments(textlist):
  """
  Function to create documents as needed for indexing.

  Wraps each text of textlist in a Document and returns them as a list,
  in the same order.
  """
  return [Document(text) for text in textlist]

def fileformatvaliditycheck(files):
  """
  Function to check validity of file formats

  Returns True when the extension (text after the last "." of .name,
  lower-cased) of every file is one of the supported formats, and False
  as soon as one is not. An empty collection yields True.
  """
  supported = ["pdf", "txt", "docx", "png", "jpg", "jpeg"]

  return all(
    upload.name.split(".")[-1].lower() in supported
    for upload in files
  )

def openaiapikeyvaliditycheck(openaikey):
  """
  Function to check validity of openai key

  Assigns the key to openai.api_key and probes it with
  openai.Model.list(). Returns "Valid OpenAI API key" when the call
  succeeds; when it raises openai.OpenAIError, returns a message that
  reports the key as incorrect and links to the API key page.
  """
  openai.api_key = openaikey

  try:
    # Only the success or failure of the request matters, not its result.
    openai.Model.list()
  except openai.OpenAIError:
    apikeylink = "https://beta.openai.com/account/api-keys"
    return f"Incorrect OpenAI API key provided: {openaikey}. You can find your OpenAI API key here - {apikeylink}"

  return "Valid OpenAI API key"
def createindex(files, openaikey):
  """
  Function to create index

  Validates the inputs in order (files present, supported formats, key
  present, key accepted by OpenAI) and returns the matching message as
  soon as one check fails. Otherwise stores the key in the
  OPENAI_API_KEY environment variable, builds a GPTListIndex from the
  uploaded documents, saves it to 'index.json' and returns a success
  message.
  """
  # Guard clauses: each failed check short-circuits with its message
  if not files:
    return "Upload file before proceeding further."
  if not fileformatvaliditycheck(files):
    return "Please upload documents in pdf/txt/docx/png/jpg/jpeg format only."
  if not openaikey:
    return "Please enter your openai key."

  key_status = openaiapikeyvaliditycheck(openaikey)
  if key_status != "Valid OpenAI API key":
    return key_status

  # Store the key in the environment
  os.environ['OPENAI_API_KEY'] = openaikey

  # Extract text, wrap it into Documents, then build and persist the index
  documents = createdocuments(processfiles(files))
  GPTListIndex(documents, chunk_size_limit = 3500).save_to_disk('index.json')

  return "Uploading documents successfully. OpenAI API Key provided is Valid."

def docques(query, openaikey):
  """
  Function for querying the index created by createindex

  Returns a short message instead of raising when the request cannot be
  served: the key is missing, the question is empty or whitespace-only,
  or 'index.json' does not exist yet (nothing has been uploaded).
  Otherwise stores the key in the OPENAI_API_KEY environment variable,
  loads the index from 'index.json' and returns the result of querying
  it with response_mode="tree_summarize".
  """

  # Basic checks, using the same messages as createindex where they apply
  if not openaikey:
    return "Please enter your openai key."

  if not query or not query.strip():
    return "Please enter your question."

  # createindex saves the index to this path; without it there is
  # nothing to query, so ask for an upload rather than fail on load.
  if not os.path.exists('index.json'):
    return "Upload file before proceeding further."

  # Store openai key in environment
  os.environ['OPENAI_API_KEY'] = openaikey

  # Load index
  index = GPTListIndex.load_from_disk('index.json')

  # Query based on index
  response = index.query(query, response_mode="tree_summarize")

  return response

def cleartext(query, output):
  """
  Function to clear text

  Ignores both arguments and returns a list of two empty strings, one
  for each output component wired to the Clear button.
  """
  return [""] * 2

# Gradio UI: header and instructions, then an input column (files, key,
# question) next to an output column (upload status, answer).
with gr.Blocks() as demo:
    gr.Markdown(
    """
    <h1><center><b>DocQues</b></center></h1>
    
    """)
    gr.Markdown(
    """
    This app answers your queries on longer and multiple documents (pdf/docx/txt/png/jpeg/jpg) you upload. It uses <a href = "https://github.com/jerryjliu/gpt_index">GPT-Index</a> and OpenAI GPT3 in the backend, get your
    <a href = "https://beta.openai.com/account/api-keys">Openai key here</a> before proceeding further.\n
    """)
    gr.Markdown(
        """
        <br>**Use this space effectively by following below 2 step process.**</br>
        *Step-1*
        <br>- Upload pdf/docx/txt/png/jpeg/jpg format documents. 
        <br>- Enter your openai key.
        <br>- Click upload and wait to see if upload is successful or not. </br>
        *Step-2*
        <br>- Enter your query. 
        <br>- Click submit.
        <br>- Check Answer </br>

        Please refer to the GitHub repo this Space is based on, here - <a href = "https://github.com/ravi03071991/DocQues">DocQues</a> .
        """
    )
    with gr.Row():
      with gr.Column():
        # Label lists every format accepted by fileformatvaliditycheck.
        files = gr.File(label = "Upload pdf/docx/txt/png/jpeg/jpg format documents.", file_count="multiple")
        # The API key is a secret, so mask it while typing.
        openaikey = gr.Textbox(lines = 1, label = "Enter your OpenAI Key.", type = "password")
        upload_button = gr.Button("Upload")
        query = gr.Textbox(lines = 2, label = "Enter Your Question.")
        submit_button = gr.Button("Submit")
      with gr.Column():
        upload_output = gr.Textbox(label = "Upload/ Error.")
        ans_output = gr.Textbox(label = "Answer.")
        clear_button = gr.Button("Clear")

    # Upload button for uploading files and openai key.
    upload_button.click(createindex, inputs=[files, openaikey], outputs= [upload_output] )

    # Submit button for submitting query.
    submit_button.click(docques, inputs=[query, openaikey], outputs= [ans_output] )

    # Clear button for clearing query and answer.
    clear_button.click(cleartext, inputs=[query, ans_output], outputs= [query, ans_output] )

# Start the app when this file is executed.
demo.launch()