Skip to content

Commit

Permalink
Merge pull request #1 from denys-potapov/main
Browse files Browse the repository at this point in the history
Upstream Merge 2023-01-001
  • Loading branch information
5HT authored Jan 10, 2023
2 parents f3dc3b8 + 9d47cbf commit c244bae
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 83 deletions.
83 changes: 59 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,15 @@

Preview should be available at http://167.99.253.189:8000/docs

## Local run

### Install requirements

sudo apt-get install postgresql tesseract-ocr tesseract-ocr-all python3-pip uvicorn
pip3 install -r requirments.txt
### Search patterns

### Prepare the DB
1. *Stop words:* `!KAMAZ` - exclude documents that contain the word `KAMAZ`.
2. *Prefix match:* `55*` - search for documents that contain words starting with `55`.

sudo -u postgres psql -c 'create database documents;'
psql -d documents -h localhost -U postgres < sql/schema.sql

### Start
### Supported formats

TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/ uvicorn main:app --reload --host 0.0.0.0

TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/ uvicorn main:app --workers 4 --port 40111
1. Scanned PDF
2. Text PDF

## Endpoints

Expand All @@ -27,17 +19,19 @@ View the interactive docs http://localhost:40111/docs
### Add document

curl -X 'POST' \
'http://localhost:40111/documents/' \
'http://localhost:40111/documents/?meta=%7B%22color%22%3A%20%22true%22%7D' \
-H 'accept: application/json' \
-H 'Content-Type: multipart/form-data' \
-F 'file=@sample2.pdf;type=application/pdf'
-F 'file=@sample.color.pdf;type=application/pdf'

Response body:

{
"id": "8070eba4-cc5b-4ba0-8a5e-9f290ef7c7ea",
"status": "PENDING",
"text": ""
"meta": {
"color": "true"
}
}

### Search documents
Expand All @@ -52,22 +46,63 @@ Response body:
{
"id": "1c71246b-1ac4-43fb-b5d7-634aa65e6ad4",
"status": "OK",
"text": "..."
"meta": {
"color": "true"
},
"headline": "..."
},
{
"id": "8070eba4-cc5b-4ba0-8a5e-9f290ef7c7ea",
"status": "OK",
"text": "..."
"meta": {
"color": "true"
},
"headline": "..."
}
]

### Highlight pages in document

## Deploy
curl -X 'GET' \
'http://localhost:40111/documents/ea2d3679-de61-4892-b3aa-f62c3a6f68c6/highlights?query=opel%20WOLF%2A' \
-H 'accept: application/json'

sudo systemctl start postgresql.service
Response body:

To log in without a password:
[
{
"page_number": 2,
"headline": "<b>Opel</b> ; Модель: Vivaro;...."
}
]

## Local run

### Install requirements

sudo apt-get install postgresql tesseract-ocr tesseract-ocr-all python3-pip uvicorn
pip3 install -r requirments.txt

### Prepare the DB

psql -d documents -h localhost -U postgres < sql/schema.sql

#### Install Ukrainian support

Download the two parts of the dictionary and add them to PostgreSQL:

wget https://github.com/brown-uk/dict_uk/releases/download/v5.9.0/hunspell-uk_UA_5.9.0.zip
unzip hunspell-uk_UA_5.9.0.zip
sudo cp uk_UA.aff `pg_config --sharedir`/tsearch_data/uk_UA.affix
sudo cp uk_UA.dic `pg_config --sharedir`/tsearch_data/uk_UA.dict

wget https://raw.githubusercontent.com/brown-uk/dict_uk/v5.9.0/distr/postgresql/ukrainian.stop
sudo cp ukrainian.stop `pg_config --sharedir`/tsearch_data/ukrainian.stop

psql -h localhost -U postgres < sql/uk_ua_search.sql

### Start

sudo -u postgres psql postgres
ALTER USER postgres WITH PASSWORD '11';
TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/ uvicorn main:app --reload --host 0.0.0.0 --port 40111

TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/ uvicorn main:app --workers 4 --host 0.0.0.0 --port 40111
112 changes: 83 additions & 29 deletions db.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,102 @@
"""DB related."""
import re

from sqlalchemy import text, bindparam, String, Float
from sqlalchemy.dialects.postgresql import JSONB

import databases

DATABASE_URL = "postgresql://postgres:11@localhost/documents"
SEARCH_CONFIG = 'ukrainian'

DATABASE_URL = 'postgresql://postgres:11@localhost/documents'

database = databases.Database(DATABASE_URL)


async def get_document(id):
"""Get document by id."""
query = "SELECT * FROM documents WHERE id = :id"
return await database.fetch_one(query=query, values={"id": id})
query = text(
"SELECT id, status, meta FROM documents WHERE id = :id"
).bindparams(id=id).columns(id=String, status=String, meta=JSONB)

return await database.fetch_one(query=query)


async def create_empty_document():
async def create_document(meta):
"""Create new document."""
query = "INSERT INTO documents DEFAULT VALUES RETURNING *"
query = text(
"""INSERT INTO documents (meta) VALUES (:meta)
RETURNING id, status, meta""").bindparams(
bindparam('meta', value=meta, type_=JSONB)).columns(
id=String, status=String, meta=JSONB)

return await database.fetch_one(query=query)


async def update_document_text(id, text):
async def update_document_text(document_id, pages: list):
"""Update document text and status."""
query = """
UPDATE documents
SET
text = :text,
status = :status
WHERE
id = :id
"""
values = {
"id": id,
"text": text,
"status": "OK"
}
return await database.execute(query=query, values=values)


async def search_documents(plain_query):
async with database.transaction():
await database.execute_many(
query="""INSERT INTO pages (document_id, number, text)
VALUES (:document_id, :number, :text)""",
values=[{
"document_id": document_id,
"number": number,
"text": text
} for number, text in enumerate(pages, start=1)]
)
await database.execute(
query="""UPDATE documents
SET text = :text, status = :status
WHERE id = :id""",
values={
"id": document_id,
"text": "\n".join(pages),
"status": "OK"
}
)


def plainto_tsquery(query):
    """Convert a plain query to ``to_tsquery`` input.

    Every run of word characters becomes one term, and the terms are
    joined with ``&``.  Two modifiers are supported: a leading ``!``
    (not) is kept as is, and a trailing ``*`` (prefix) is rewritten to
    ``:*``.  Any other character is dropped.
    """
    words = re.findall(r'!?\w+\*?', query)
    # The pattern only lets `*` through as the last character of a term,
    # so replacing on the joined string cannot touch anything else.
    return " & ".join(words).replace('*', ':*')


async def search_documents(plain_query, limit, offset):
"""Search documents."""
query = """
SELECT *
FROM documents
WHERE
ts @@ plainto_tsquery('simple', :plain_query)"""
query = text("""
SELECT id, status, meta,
ts_headline(:search_config, text, query) AS headline,
ts_rank_cd(ts, query) AS rank
FROM documents, to_tsquery(:search_config, :ts_query) query
WHERE query @@ ts
ORDER BY rank DESC
LIMIT :limit OFFSET :offset;""").bindparams(
search_config=SEARCH_CONFIG,
ts_query=plainto_tsquery(plain_query),
limit=limit,
offset=offset
).columns(
id=String, status=String, meta=JSONB, headline=String, rank=Float)

return await database.fetch_all(query=query)


async def highlight_document(document_id, plain_query):
"""Highlights pages in document."""
return await database.fetch_all(
query=query, values={"plain_query": plain_query})
query="""
SELECT number as page_number,
ts_headline(:search_config, text, query) AS headline
FROM pages, to_tsquery(:search_config, :ts_query) query
WHERE to_tsvector(:search_config, text) @@ query AND
document_id = :document_id
ORDER BY number""",
values={
"search_config": SEARCH_CONFIG,
"document_id": document_id,
"ts_query": plainto_tsquery(plain_query)
}
)
40 changes: 24 additions & 16 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,24 @@
from fastapi.concurrency import run_in_threadpool
from fastapi.openapi.utils import get_openapi

from ocr import get_text
from ocr import get_pages
import db
import models

from pydantic import Json

app = FastAPI()


def openapi_schema():
schema = get_openapi(
title="Document search",
version="0.1",
description="Search documents from any forrmat",
routes=app.routes,
)
app.openapi_schema = schema
return app.openapi_schema
schema = get_openapi(
title="Document search",
version="0.1",
description="Search documents from any forrmat",
routes=app.routes,
)
app.openapi_schema = schema
return app.openapi_schema


app.openapi = openapi_schema
Expand All @@ -39,8 +40,8 @@ async def shutdown():

async def process_document(document_id, stream: bytes):
"""Processs uploaded document."""
text = await run_in_threadpool(lambda: get_text(stream))
await db.update_document_text(document_id, text)
pages = await run_in_threadpool(lambda: get_pages(stream))
await db.update_document_text(document_id, pages)


@app.get("/documents/{document_id}", response_model=models.Document)
Expand All @@ -49,16 +50,23 @@ async def get_document(
return await db.get_document(document_id)


@app.get("/search", response_model=list[models.Document])
async def search(query: str):
return await db.search_documents(query)
@app.get(
    "/documents/{document_id}/highlights",
    response_model=list[models.Highlight])
async def highlights(document_id: UUID, query: str):
    """List the pages of a document that match the query.

    The query accepts the `!word` (not) and `word*` (prefix) modifiers.
    """
    return await db.highlight_document(document_id, query)


@app.get("/search", response_model=list[models.SearchResult])
async def search(query: str, limit: int = 50, offset: int = 0):
    """Search documents, best ranked first.

    The query accepts the `!word` (not) and `word*` (prefix) modifiers;
    `limit` and `offset` page through the results.
    """
    return await db.search_documents(query, limit, offset)


@app.post("/documents/", response_model=models.Document)
async def create_document(
background_tasks: BackgroundTasks,
file: bytes = File()):
document = await db.create_empty_document()
meta: Json, file: bytes = File()):
document = await db.create_document(meta)
background_tasks.add_task(process_document, document["id"], file)

return document
14 changes: 12 additions & 2 deletions models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,15 @@

class Document(BaseModel):
id: UUID
status: str = "PENDING"
text: str = ''
status: str
meta: dict


# A document as returned by the search endpoint: the Document fields
# plus the per-hit values below.
class SearchResult(Document):
    # Highlighted excerpt for the hit.
    headline: str
    # Relevance score for the hit.
    rank: float


# One matching page of a document, as returned by the highlights
# endpoint.
class Highlight(BaseModel):
    # Number of the matching page.
    page_number: int
    # Highlighted excerpt from that page.
    headline: str
16 changes: 6 additions & 10 deletions ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,14 @@
DPI = 300


def get_text(stream):
"""Read document from data bytes."""
doc = fitz.open(stream=stream)
text = ''
for page in doc:
ocr = page.get_textpage_ocr(language=LANG, dpi=DPI)
text += ocr.extractText()

return text
def get_pages(stream):
    """Read document pages from data bytes.

    Returns a list with one string per page, in document order, holding
    the text extracted from that page's OCR text page.
    """
    # Open the document as a context manager so it is closed as soon as
    # every page has been read, rather than whenever it happens to be
    # garbage collected.
    with fitz.open(stream=stream) as doc:
        return [
            page.get_textpage_ocr(language=LANG, dpi=DPI).extractText()
            for page in doc]


if __name__ == '__main__':
print(os.environ['TESSDATA_PREFIX'])
preview = get_text(open(sys.argv[1], 'rb').read())
preview = get_pages(open(sys.argv[1], 'rb').read())
print(preview)
Loading

0 comments on commit c244bae

Please sign in to comment.