Merge pull request #1 from denys-potapov/main

Upstream Merge 2023-01-001
erpuno · Jan 10, 2023 · c244bae · c244bae
2 parents f3dc3b8 + 9d47cbf
commit c244bae
Show file tree

Hide file tree

Showing 7 changed files with 225 additions and 83 deletions.
diff --git a/README.md b/README.md
@@ -2,23 +2,15 @@
 
 Preview should be available at http://167.99.253.189:8000/docs
 
-## Local run
-
-### Install requirments
-
-    sudo apt-get install postgresql tesseract-ocr tesseract-ocr-all python3-pip uvicorn
-    pip3 install -r requirments.txt
+### Search patterns
 
-### Prepare the DB
+1. *Stop words:* `!KAMAZ` - exclude documents that contain the word `KAMAZ`.
+2. *Prefix match:* `55*` - find documents that contain words starting with `55`.
 
-    sudo -u postgres psql -c 'create database documents;'
-    psql -d documents -h localhost -U postgres < sql/schema.sql
-
-### Start
+### Supported formats
 
-    TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/ uvicorn main:app --reload --host 0.0.0.0
-
-    TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/ uvicorn main:app --workers 4 --port 40111
+1. Scanned PDF
+2. Text PDF
 
 ## Endpoints
 
@@ -27,17 +19,19 @@ View the interactive docs http://localhost:40111/docs
 ### Add document
 
     curl -X 'POST' \
-      'http://localhost:40111/documents/' \
+      'http://localhost:40111/documents/?meta=%7B%22color%22%3A%20%22true%22%7D' \
       -H 'accept: application/json' \
       -H 'Content-Type: multipart/form-data' \
-      -F 'file=@sample2.pdf;type=application/pdf'
+      -F 'file=@sample.color.pdf;type=application/pdf'
 
 Response body:
 
     {
       "id": "8070eba4-cc5b-4ba0-8a5e-9f290ef7c7ea",
       "status": "PENDING",
-      "text": ""
+      "meta": {
+        "color": "true"
+      }
     }
 
 ### Search documents
@@ -52,22 +46,63 @@ Response body:
       {
         "id": "1c71246b-1ac4-43fb-b5d7-634aa65e6ad4",
         "status": "OK",
-        "text": "..."
+        "meta": {
+          "color": "true"
+        },
+        "headline": "..."
       },
       {
         "id": "8070eba4-cc5b-4ba0-8a5e-9f290ef7c7ea",
         "status": "OK",
-        "text": "..."
+        "meta": {
+          "color": "true"
+        },
+        "headline": "..."
       }
     ]
 
+### Highlight pages in document
 
-## Deploy
+    curl -X 'GET' \
+      'http://localhost:40111/documents/ea2d3679-de61-4892-b3aa-f62c3a6f68c6/highlights?query=opel%20WOLF%2A' \
+      -H 'accept: application/json'
 
-    sudo systemctl start postgresql.service
+Response body:
 
-To log in without a password:
+    [
+      {
+        "page_number": 2,
+        "headline": "<b>Opel</b> ; Модель: Vivaro;...."
+      }
+    ]
+
+## Local run
+
+### Install requirements
+
+    sudo apt-get install postgresql tesseract-ocr tesseract-ocr-all python3-pip uvicorn
+    pip3 install -r requirments.txt
+
+### Prepare the DB
+
+    psql -d documents -h localhost -U postgres < sql/schema.sql
+
+#### Install Ukrainian support
+
+Download the two parts of the dictionary (the Hunspell files and the stop-word list) and add them to PostgreSQL:
+
+    wget https://github.com/brown-uk/dict_uk/releases/download/v5.9.0/hunspell-uk_UA_5.9.0.zip
+    unzip hunspell-uk_UA_5.9.0.zip
+    sudo cp uk_UA.aff `pg_config --sharedir`/tsearch_data/uk_UA.affix
+    sudo cp uk_UA.dic `pg_config --sharedir`/tsearch_data/uk_UA.dict
+
+    wget https://raw.githubusercontent.com/brown-uk/dict_uk/v5.9.0/distr/postgresql/ukrainian.stop
+    sudo cp ukrainian.stop `pg_config --sharedir`/tsearch_data/ukrainian.stop
+
+    psql -h localhost -U postgres < sql/uk_ua_search.sql
+
+### Start
 
-    sudo -u postgres psql postgres
-    ALTER USER postgres WITH PASSWORD '11';
+    TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/ uvicorn main:app --reload --host 0.0.0.0 --port 40111
 
+    TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/ uvicorn main:app --workers 4 --host 0.0.0.0 --port 40111
diff --git a/db.py b/db.py
@@ -1,48 +1,102 @@
 """DB related."""
+import re
+
+from sqlalchemy import text, bindparam, String, Float
+from sqlalchemy.dialects.postgresql import JSONB
+
 import databases
 
-DATABASE_URL = "postgresql://postgres:11@localhost/documents"
+SEARCH_CONFIG = 'ukrainian'
+
+DATABASE_URL = 'postgresql://postgres:11@localhost/documents'
 
 database = databases.Database(DATABASE_URL)
 
 
 async def get_document(id):
     """Get document by id."""
-    query = "SELECT * FROM documents WHERE id = :id"
-    return await database.fetch_one(query=query, values={"id": id})
+    query = text(
+        "SELECT id, status, meta FROM documents WHERE id = :id"
+    ).bindparams(id=id).columns(id=String, status=String, meta=JSONB)
+
+    return await database.fetch_one(query=query)
 
 
-async def create_empty_document():
+async def create_document(meta):
     """Create new document."""
-    query = "INSERT INTO documents DEFAULT VALUES RETURNING *"
+    query = text(
+        """INSERT INTO documents (meta) VALUES (:meta)
+        RETURNING id, status, meta""").bindparams(
+        bindparam('meta', value=meta, type_=JSONB)).columns(
+        id=String, status=String, meta=JSONB)
+
     return await database.fetch_one(query=query)
 
 
-async def update_document_text(id, text):
+async def update_document_text(document_id, pages: list):
     """Update document text and status."""
-    query = """
-        UPDATE documents
-        SET
-            text = :text,
-            status = :status
-        WHERE
-            id = :id
-    """
-    values = {
-        "id": id,
-        "text": text,
-        "status": "OK"
-    }
-    return await database.execute(query=query, values=values)
-
-
-async def search_documents(plain_query):
+    async with database.transaction():
+        await database.execute_many(
+            query="""INSERT INTO pages (document_id, number, text)
+                VALUES (:document_id, :number, :text)""",
+            values=[{
+                "document_id": document_id,
+                "number": number,
+                "text": text
+            } for number, text in enumerate(pages, start=1)]
+        )
+    await database.execute(
+            query="""UPDATE documents
+                SET text = :text, status = :status
+                WHERE id = :id""",
+            values={
+                "id": document_id,
+                "text": "\n".join(pages),
+                "status": "OK"
+            }
+        )
+
+
+def plainto_tsquery(query):
+    """Convert plain query to tsquery input.
+    Supports ! (not) and * (prefix) modifiers."""
+    words = re.findall(r'!?\w+\*?', query)
+    return " & ".join(words).replace('*', ':*')
+
+
+async def search_documents(plain_query, limit, offset):
     """Search documents."""
-    query = """
-        SELECT *
-        FROM documents
-        WHERE
-            ts @@ plainto_tsquery('simple', :plain_query)"""
+    query = text("""
+        SELECT id, status, meta,
+               ts_headline(:search_config, text, query) AS headline,
+               ts_rank_cd(ts, query) AS rank
+        FROM   documents, to_tsquery(:search_config, :ts_query) query
+        WHERE  query @@ ts
+        ORDER BY rank DESC
+        LIMIT :limit OFFSET :offset;""").bindparams(
+            search_config=SEARCH_CONFIG,
+            ts_query=plainto_tsquery(plain_query),
+            limit=limit,
+            offset=offset
+        ).columns(
+            id=String, status=String, meta=JSONB, headline=String, rank=Float)
+
+    return await database.fetch_all(query=query)
+
 
+async def highlight_document(document_id, plain_query):
+    """Highlights pages in document."""
     return await database.fetch_all(
-        query=query, values={"plain_query": plain_query})
+        query="""
+            SELECT number as page_number,
+                   ts_headline(:search_config, text, query) AS headline
+            FROM   pages, to_tsquery(:search_config, :ts_query) query
+            WHERE  to_tsvector(:search_config, text) @@ query AND
+                   document_id = :document_id
+            ORDER BY number""",
+        values={
+            "search_config": SEARCH_CONFIG,
+            "document_id": document_id,
+            "ts_query": plainto_tsquery(plain_query)
+        }
+    )
diff --git a/main.py b/main.py
@@ -5,23 +5,24 @@
 from fastapi.concurrency import run_in_threadpool
 from fastapi.openapi.utils import get_openapi
 
-from ocr import get_text
+from ocr import get_pages
 import db
 import models
 
+from pydantic import Json
 
 app = FastAPI()
 
 
 def openapi_schema():
-   schema = get_openapi(
-       title="Document search",
-       version="0.1",
-       description="Search documents from any forrmat",
-       routes=app.routes,
-   )
-   app.openapi_schema = schema
-   return app.openapi_schema
+    schema = get_openapi(
+        title="Document search",
+        version="0.1",
+        description="Search documents from any forrmat",
+        routes=app.routes,
+    )
+    app.openapi_schema = schema
+    return app.openapi_schema
 
 
 app.openapi = openapi_schema
@@ -39,8 +40,8 @@ async def shutdown():
 
 async def process_document(document_id, stream: bytes):
     """Processs uploaded document."""
-    text = await run_in_threadpool(lambda: get_text(stream))
-    await db.update_document_text(document_id, text)
+    pages = await run_in_threadpool(lambda: get_pages(stream))
+    await db.update_document_text(document_id, pages)
 
 
 @app.get("/documents/{document_id}", response_model=models.Document)
@@ -49,16 +50,23 @@ async def get_document(
     return await db.get_document(document_id)
 
 
-@app.get("/search", response_model=list[models.Document])
-async def search(query: str):
-    return await db.search_documents(query)
+@app.get(
+    "/documents/{document_id}/highlights",
+    response_model=list[models.Highlight])
+async def highlights(document_id: UUID, query: str):
+    return await db.highlight_document(document_id, query)
+
+
+@app.get("/search", response_model=list[models.SearchResult])
+async def search(query: str, limit: int = 50, offset: int = 0):
+    return await db.search_documents(query, limit, offset)
 
 
 @app.post("/documents/", response_model=models.Document)
 async def create_document(
         background_tasks: BackgroundTasks,
-        file: bytes = File()):
-    document = await db.create_empty_document()
+        meta: Json, file: bytes = File()):
+    document = await db.create_document(meta)
     background_tasks.add_task(process_document, document["id"], file)
 
     return document
diff --git a/models.py b/models.py
@@ -6,5 +6,15 @@
 
 class Document(BaseModel):
     id: UUID
-    status: str = "PENDING"
-    text: str = ''
+    status: str
+    meta: dict
+
+
+class SearchResult(Document):
+    headline: str
+    rank: float
+
+
+class Highlight(BaseModel):
+    page_number: int
+    headline: str
diff --git a/ocr.py b/ocr.py
@@ -9,18 +9,14 @@
 DPI = 300
 
 
-def get_text(stream):
-    """Read document from data bytes."""
-    doc = fitz.open(stream=stream)
-    text = ''
-    for page in doc:
-        ocr = page.get_textpage_ocr(language=LANG, dpi=DPI)
-        text += ocr.extractText()
-
-    return text
+def get_pages(stream):
+    """Read document pages from data bytes."""
+    return [
+        page.get_textpage_ocr(language=LANG, dpi=DPI).extractText()
+        for page in fitz.open(stream=stream)]
 
 
 if __name__ == '__main__':
     print(os.environ['TESSDATA_PREFIX'])
-    preview = get_text(open(sys.argv[1], 'rb').read())
+    preview = get_pages(open(sys.argv[1], 'rb').read())
     print(preview)