Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Metadata extraction using LLM API service #29

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
5 changes: 5 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,10 @@ LANGUAGES=mul,eng,nob
# and optionally, to restrict the detection to a subset of languages
# GIELLADETECT_LANGS=nno,nob,eng,swe,fin

# To use an LLM API service, specify at least the base URL of an OpenAI-style API
# LLM_API_URL=http://localhost:8080/
# LLM_API_KEY=
# LLM_MODEL=

# To have Meteor run on a different path (only for stage and prod environments), set
# CUSTOM_PATH=/meteor-custom-path
6 changes: 4 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ async def get_front_page_html(request: Request) -> Response:
"index.html",
{
"request": request,
"root_path": Utils.get_environment_prefix()
"root_path": Utils.get_environment_prefix(),
"backends": Utils.get_available_backends()
}
)

Expand All @@ -78,7 +79,8 @@ def display_error_message_in_template(request: Request, exc: StarletteHTTPExcept
"results": {
'error': str(exc.detail)
},
"root_path": Utils.get_environment_prefix()
"root_path": Utils.get_environment_prefix(),
"backends": Utils.get_available_backends()
},
status_code=exc.status_code
)
1 change: 1 addition & 0 deletions metadata_extract/candidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class Origin(Enum):
COPYRIGHT = 5
RAPPORT_PREFIX = 6
LANGUAGE_MODEL = 7
LLM = 8


class OriginType(TypedDict):
Expand Down
127 changes: 127 additions & 0 deletions metadata_extract/llm_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""The LLM extractor module extracts metadata using an external LLM API service."""

from typing import TypedDict, Optional
import json
import requests
from .candidate import AuthorType, Candidate, Origin
from .metadata import Metadata
from .meteor_document import MeteorDocument
from .registry import PublisherRegistry


class LLMConfig(TypedDict):
    """Configuration for LLM API service"""
    api_url: str  # base URL of an OpenAI-style API (with or without trailing slash)
    api_key: str  # bearer token; an empty string disables the Authorization header
    model: str    # model identifier sent in the request payload


class LLMExtractor:
    """An LLMExtractor object loads a MeteorDocument and fills a Metadata object
    by performing a call to an external LLM API service."""

    SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous " + \
        "cataloguing of digital documents."
    INSTRUCTION = "Extract metadata from this document. Return as JSON."
    MAX_TOKENS = 1024
    TEMPERATURE = 0.0   # deterministic output
    TIMEOUT = 120       # seconds to wait for the API response

    def __init__(self, doc: MeteorDocument,
                 registry: Optional[PublisherRegistry],
                 llm_config: LLMConfig):
        self._doc = doc
        self._registry = registry
        self._config = llm_config
        self.metadata = Metadata()

    def extract_metadata(self) -> None:
        """Send the document text to the LLM service and store the parsed
        field candidates in self.metadata.

        Raises:
            requests.HTTPError: if the API responds with an error status.
            json.JSONDecodeError: if the LLM response is not valid JSON.
        """
        doc_json = self._doc.extract_text_as_json()
        response = self._llm_request(doc_json)
        self._parse_response_to_doc(response)

    def _llm_request(self, doc_json: str) -> str:
        """POST the document JSON to the chat completions endpoint and return
        the content of the first choice as a string."""
        message = f"{self.INSTRUCTION}\n\n{doc_json}"

        # Normalize the base URL so a trailing slash is optional
        url = self._config['api_url'].rstrip('/') + '/chat/completions'

        headers = {
            "Content-Type": "application/json"
        }

        if self._config['api_key']:
            headers['Authorization'] = f'Bearer {self._config["api_key"]}'

        data = {
            "model": self._config['model'],
            "messages": [
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {"role": "user", "content": message},
            ],
            "temperature": self.TEMPERATURE,
            "max_tokens": self.MAX_TOKENS
        }

        api_response = requests.post(url,
                                     headers=headers,
                                     json=data,
                                     timeout=self.TIMEOUT)

        api_response.raise_for_status()
        return str(api_response.json()['choices'][0]['message']['content'])

    def _parse_response_to_doc(self, response: str) -> None:
        """Parse the LLM's JSON response and add the recognized fields as
        Candidate objects with Origin.LLM to self.metadata."""
        metadata = json.loads(response)

        # language
        if 'language' in metadata:
            self.metadata.add_candidate('language', Candidate(metadata['language'], Origin.LLM))

        # title
        if 'title' in metadata:
            self.metadata.add_candidate('title', Candidate(metadata['title'], Origin.LLM))

        # creator: assumed to be a list of "Lastname, Firstname" strings
        if 'creator' in metadata:
            for creator in metadata['creator']:
                if ', ' in creator:
                    lastname, firstname = creator.split(', ', maxsplit=1)
                else:
                    lastname = creator
                    firstname = ""
                author_dict: AuthorType = {"firstname": firstname, "lastname": lastname}
                self.metadata.add_candidate('author', Candidate(author_dict, Origin.LLM))

        # year: guard the conversion - the LLM may produce a non-numeric
        # value (e.g. "2020-2021"), which must not abort the whole extraction
        if 'year' in metadata:
            try:
                year = int(metadata['year'])
            except (TypeError, ValueError):
                year = None
            if year is not None:
                self.metadata.add_candidate('year', Candidate(year, Origin.LLM))

        # publisher
        if 'publisher' in metadata:
            for publisher in metadata['publisher']:
                publisher_candidate = Candidate(publisher, Origin.LLM)
                if self._registry:
                    publisher_candidate.reg_entries = self._registry.search(publisher)
                self.metadata.add_candidate('publisher', publisher_candidate)

        # doi - not supported by Meteor

        # e-isbn
        if 'e-isbn' in metadata:
            # This is pretty poor, we just pass the found e-ISBNs (almost never more than one)
            # to Meteor directly and let it pick one essentially at random
            for e_isbn in metadata['e-isbn']:
                self.metadata.add_candidate('ISBN', Candidate(e_isbn, Origin.LLM))

        # p-isbn - Meteor isn't interested in printed ISBNs

        # e-issn: accept a single value or a list, for consistency with the
        # e-isbn handling above (the LLM schema for this field is not fixed)
        if 'e-issn' in metadata:
            e_issn = metadata['e-issn']
            for issn in (e_issn if isinstance(e_issn, list) else [e_issn]):
                self.metadata.add_candidate('ISSN', Candidate(issn, Origin.LLM))

        # p-issn - Meteor isn't interested in printed ISSNs

        # type_coar - not supported by Meteor
5 changes: 5 additions & 0 deletions metadata_extract/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@ def choose_title(self) -> Optional[CandidateType]:
and not text.has_no_letters(c.value)]
if page_title:
return page_title[0].to_dict()
llm_title = [c for c in self.candidates['title'] if
c.origin == Origin.LLM and isinstance(c.value, str)
and not text.has_no_letters(c.value)]
if llm_title:
return llm_title[0].to_dict()
return None

def choose_publishers(self) -> Optional[CandidateType]:
Expand Down
19 changes: 14 additions & 5 deletions metadata_extract/meteor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .meteor_document import MeteorDocument
from .metadata import Results
from .finder import Finder
from .llm_extractor import LLMConfig, LLMExtractor


class Meteor:
Expand All @@ -24,6 +25,7 @@ def __init__(self, languages: Optional[list[str]] = None) -> None:
self.registry: Optional[PublisherRegistry] = None
ResourceLoader.load(languages)
self.detect_language: Callable[[str], Optional[str]] = Meteor.__default_detect
self.llm_config: Optional[LLMConfig] = None

@staticmethod
def __default_detect(text: str) -> Optional[str]:
Expand All @@ -41,9 +43,16 @@ def set_registry(self, registry: PublisherRegistry) -> None:
def set_language_detection_method(self, detect_language: Callable[[str], str]) -> None:
self.detect_language = detect_language

def run(self, file_path: str) -> Results:
def set_llm_config(self, llm_config: LLMConfig) -> None:
    """Enable the LLM extraction backend by providing its API configuration."""
    self.llm_config = llm_config

def run(self, file_path: str, backend: Optional[str] = None) -> Results:
    """Extract metadata from the file at file_path and return the results.

    The optional backend argument selects the extraction engine:
    'llmextractor' (case-insensitive) uses the external LLM API service,
    provided it has been configured via set_llm_config; any other value,
    or a missing LLM configuration, falls back to the rule-based Finder.
    """
    with MeteorDocument(file_path) as doc:
        # Both branches assign, so no Optional/None placeholder is needed
        extractor: LLMExtractor | Finder
        if backend and backend.lower() == 'llmextractor' and self.llm_config:
            extractor = LLMExtractor(doc, self.registry, self.llm_config)
        else:
            extractor = Finder(doc, self.registry, self.detect_language)
        extractor.extract_metadata()
        extractor.metadata.choose_best()
        return extractor.metadata.results
54 changes: 54 additions & 0 deletions metadata_extract/meteor_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
"""


import json
from pathlib import Path
import re
from types import TracebackType
from typing import Optional, Self, Type
import fitz
import regex
from .page import Page
from .alto_utils import AltoFile

Expand All @@ -19,6 +22,13 @@ class MeteorDocument:
content. MeteorDocuments are context managers, so they can be used in `with` statements.
"""

# text extraction settings for LLM
PAGES = [0, 1, 2, 3, 4, 5, 6, 7, -2, -1] # pages to analyze: first 8 pages + last 2 pages
THRESHOLD = 100 # paragraphs shorter than this will always be kept
LONG_PARA_PAGES = [0, 1] # on first two pages, some long paragraphs are accepted
LONG_PARA_MAX = 2 # how many long paragraphs to keep on the first two pages
PDF_METADATA_SKIP = {'format', 'creator', 'producer'} # PDF metadata fields not to include

def __init__(self, file_path: str,
start: int = 5,
end: int = 5):
Expand Down Expand Up @@ -92,3 +102,47 @@ def get_page_object(self, page_number: int) -> Page:
raise ValueError('No PDF file to load page from')
self.page_objects[page_number] = Page(pdf_page=self.pdfdoc.load_page(page_number - 1))
return self.page_objects[page_number]

def extract_text_as_json(self) -> str:
    """Extract text and metadata as a JSON string suitable for a LLM.

    The pages listed in PAGES are scanned; each page's text is split into
    paragraphs, ToC-like entries are dropped, and paragraphs longer than
    THRESHOLD are kept only on LONG_PARA_PAGES (at most LONG_PARA_MAX).

    Raises:
        ValueError: if no PDF document is loaded.
    """

    if not self.pdfdoc:
        raise ValueError('No PDF document set')

    pdfinfo = {}
    pages = []
    num_pages = len(self.pdfdoc)

    for key in self.pdfdoc.metadata.keys():
        if key not in self.PDF_METADATA_SKIP and self.pdfdoc.metadata.get(key):
            pdfinfo[key] = self.pdfdoc.metadata.get(key)

    for page in self.PAGES:
        # Skip non-negative indexes that fall within the last two pages:
        # those are already covered by the trailing negative indexes in
        # PAGES, otherwise short documents would emit duplicate pages.
        if page >= 0 and page >= num_pages - 2:
            continue
        # Skip negative indexes that underflow very short documents
        # (e.g. -2 on a single-page PDF would raise IndexError).
        if page < 0 and num_pages + page < 0:
            continue

        texts = []
        text = self.pdfdoc[page].get_text(sort=True)
        # Use regular expression to split text into paragraphs
        # Delimiter: newline(s) followed by an upper case character
        paragraphs = regex.split(r'\n+(?=\p{Lu})', text, flags=re.UNICODE)
        long_paragraph_count = 0

        for paragraph in paragraphs:
            paragraph = " ".join(paragraph.strip().split())

            if '.....' in paragraph or '. . . . .' in paragraph:
                # looks like a ToC entry, skip it
                continue
            if len(paragraph) < self.THRESHOLD:  # short paragraph, keep it
                texts.append(paragraph)
            elif page in self.LONG_PARA_PAGES and long_paragraph_count < self.LONG_PARA_MAX:
                # allow some long paragraphs on the first two pages
                long_paragraph_count += 1
                texts.append(paragraph)
            # any other long paragraph is dropped

        text = '\n'.join(texts)
        if text:
            pages.append({"page": self.pdfdoc[page].number, "text": text})

    return json.dumps({"pdfinfo": pdfinfo, "pages": pages})
22 changes: 14 additions & 8 deletions src/routes/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from typing import Annotated, Optional

from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Query
from starlette.datastructures import UploadFile
from starlette.requests import Request
from starlette.responses import HTMLResponse, JSONResponse, Response
Expand All @@ -31,17 +31,18 @@ async def post_pdf_html(
form = await request.form()
file_input = form.get('fileInput')
file_url = form.get('fileUrl')
backend = str(form.get('backend'))

if file_url != "" and isinstance(file_url, str):
utils.verify_url(file_url)
filename: Optional[str] = file_url
filepath = utils.download_file(file_url)
results = utils.process_and_remove(filename, filepath)
results = utils.process_and_remove(filename, filepath, backend=backend)
elif file_input is not None and isinstance(file_input, UploadFile):
utils.verify_file(file_input)
filename = file_input.filename
filepath = utils.save_file(file_input)
results = utils.process_and_remove(filename, filepath)
results = utils.process_and_remove(filename, filepath, backend=backend)
else:
raise HTTPException(400)
return templates.TemplateResponse(
Expand All @@ -53,7 +54,8 @@ async def post_pdf_html(
"filepath": filepath,
"filename": filename,
"results": results,
"root_path": utils.get_environment_prefix()
"root_path": utils.get_environment_prefix(),
"backends": utils.get_available_backends()
}
)

Expand All @@ -68,15 +70,18 @@ async def post_pdf_json(
form = await request.form()
file_input = form.get('fileInput')
file_url = form.get('fileUrl')
backend = str(form.get('backend'))

if file_url != "" and isinstance(file_url, str):
utils.verify_url(file_url)
filepath = utils.download_file(file_url)
results = utils.process_and_remove(file_url, filepath, delete_immediately=True)
results = utils.process_and_remove(
file_url, filepath, backend=backend, delete_immediately=True)
elif file_input is not None and isinstance(file_input, UploadFile):
utils.verify_file(file_input)
filepath = utils.save_file(file_input)
results = utils.process_and_remove(file_input.filename, filepath, delete_immediately=True)
results = utils.process_and_remove(
file_input.filename, filepath, backend=backend, delete_immediately=True)
else:
raise HTTPException(400)
return JSONResponse(results)
Expand All @@ -85,13 +90,14 @@ async def post_pdf_json(
@router.get("/file/{file_name}", response_class=JSONResponse, status_code=200)
async def get_metadata_from_file_on_disk(
file_name: str,
conf: Annotated[Settings, Depends(get_settings)]
conf: Annotated[Settings, Depends(get_settings)],
backend: Optional[str] = Query(None) # Define the optional query parameter
) -> JSONResponse:
"""
Extract metadata from a file on disk and return it as JSON
"""
try:
results = utils.meteor.run(conf.MOUNT_FOLDER + '/' + file_name)
results = utils.meteor.run(conf.MOUNT_FOLDER + '/' + file_name, backend=backend)
except Exception:
return JSONResponse({"error": f"Error while processing {file_name}"})
return JSONResponse(results)
3 changes: 3 additions & 0 deletions src/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class Settings(BaseSettings):
USE_GIELLADETECT: bool = False
GIELLADETECT_LANGS: str = ""
CUSTOM_PATH: str = ""
LLM_API_URL: str = ""
LLM_API_KEY: str = ""
LLM_MODEL: str = ""


settings = Settings()
Expand Down
Loading