feat(Database): Aggregate SQLAlchemy-based DB management in wordprofile.db

gremid committed Jan 9, 2025 · 1 parent 282e82e · commit edb07ad
Showing 11 changed files with 261 additions and 374 deletions.
45 changes: 21 additions & 24 deletions README.md
````diff
@@ -73,40 +73,37 @@ In this call the subcorpora in `test_wp/colloc` are merged, …
 With the `--mwe` option, after the subcorpora have been merged, chains of collocations ("multi-word expressions") are searched for, i.e. overlaps of two collocations.
 
 ### 3. Populating the database
 
 In this step the results are written to the database and indices are created on the database tables to allow collocations to be queried efficiently.
 
-```sh
-usage: load_database.py [-h] [--user USER] [--db DB] source
-positional arguments:
-  source       temporary storage path
-options:
-  -h, --help   show this help message and exit
-  --user USER  database username
-  --db DB      database name
-```
-
-The data is laid out in files such that it can be loaded directly into a SQL DB. The database is written to a local directory (`data/db`) which is mounted into the Docker container.
-Before the Docker container is started, make sure that this directory exists and is owned by the correct user, e.g.:
-```sh
-mkdir -p data/db
-```
-After that, the container can be started with
-```sh
-docker compose build # if the container does not exist yet
-docker compose up
-```
-Example invocation:
-```shell
-python wordprofile/cli/load_database.py test_wp/stats --user wpuser --db test_wp
-```
+#### 3.1. Starting a local database instance
+If no other settings are supplied via environment variables, a local database instance is populated. To make such an instance available, a suitable MariaDB container is started via Docker:
+The Docker container is started under the corresponding user; for this, the `USER_GROUP` environment variable must be set:
+```sh
+export USER_GROUP=$(id -u):$(id -g)
+docker compose up db
+```
+The instance's data resides on a Docker volume, from which it can be copied for transfer to other systems. Under GNU/Linux, for example, the volume is by default located in the file system under
+`/var/lib/docker/volumes/wordprofile_db/_data`
+To remove the container and its data after a profile has been built, use the command
+```sh
+docker compose down db -v
+```
+#### 3.2. Populating the database
+The data is laid out in files such that it can be loaded directly into a SQL DB.
+The following script takes care of this:
+```sh
+python wordprofile/cli/load_database.py test_wp/stats
+```
+The optional `--clear` parameter wipes the database before (re)populating it.
 
 ## Preprocessing
 
 For converting `.tabs` files to `.conll`, the Python scripts `data_update.py` or `tabs2conllu.py` can be used (in the directory `wordprofile/preprocessing/cli/`).
````
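Read end to end, the updated section 3 amounts to the following sequence (a sketch assembled purely from the commands above; `--clear` is optional):

```sh
export USER_GROUP=$(id -u):$(id -g)
docker compose up db                                    # start the local MariaDB instance
python wordprofile/cli/load_database.py test_wp/stats --clear
docker compose down db -v                               # afterwards: remove container and data
```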
4 changes: 4 additions & 0 deletions pyproject.toml
```diff
@@ -31,6 +31,10 @@ profile = "black"
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 
+[[tool.mypy.overrides]]
+module = ["pymysql", "MySQLdb"]
+ignore_missing_imports = true
+
 [tool.setuptools.dynamic]
 readme = {file = ["README.md"], content-type = "text/markdown"}
 version = {attr = "wordprofile.version.__version__"}
```
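The new override silences mypy's missing-stub errors for the `pymysql` and `MySQLdb` MySQL driver modules, which ship without type annotations.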
20 changes: 4 additions & 16 deletions tests/test_connector.py
```diff
@@ -1,17 +1,14 @@
 import os
-import pathlib
+from pathlib import Path
 from subprocess import check_call
 import unittest
 from time import sleep
 
 import pytest
-import sqlalchemy as sq
-
-import wordprofile.wpse.create as wc
+from wordprofile.db import open_db, load_db
 from wordprofile.datatypes import Coocc
 from wordprofile.wpse.connector import WPConnect
 from wordprofile.wpse.mwe_connector import WPMweConnect
-from wordprofile.wpse.processing import load_files_into_db
 
 
 @pytest.fixture(autouse=True, scope="session")
@@ -24,22 +21,13 @@ def test_db():
     check_call(["docker", "compose", "-p", "wp_test", "down", "db", "-v"])
 
 
-def create_database():
-    engine = sq.create_engine(
-        "mysql+pymysql://wp:wp@localhost:3306/wp?charset=utf8mb4&local_infile=1"
-    )
-    with engine.connect() as conn:
-        wc.init_word_profile_tables(conn, "test")
-        data_dir = pathlib.Path(__file__).parent / "testdata" / "test_db"
-        load_files_into_db(conn, data_dir)
-        wc.create_indices(conn)
-        wc.create_statistics(conn)
+db_test_data_dir = Path(__file__).parent / "testdata" / "test_db"
 
 
 class WPConnectTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        create_database()
+        load_db(open_db(clear=True), db_test_data_dir)
         cls.connector = WPConnect(
             host="localhost", user="wp", passwd="wp", dbname="wp"
         )
```
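With `create_database` gone, the fixture-backed tests exercise the same `open_db`/`load_db` path as the CLI below instead of duplicating engine construction and schema setup.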
45 changes: 10 additions & 35 deletions wordprofile/cli/load_database.py
```diff
@@ -1,49 +1,24 @@
 import logging
-import os
 from argparse import ArgumentParser
 
-from sqlalchemy import create_engine
-
+from wordprofile.db import open_db, load_db
 from wordprofile.utils import configure_logs_to_file
-from wordprofile.wpse.create import (
-    create_indices,
-    create_statistics,
-    init_word_profile_tables,
-)
-from wordprofile.wpse.processing import load_files_into_db
 
 
 def main():
-    logger = logging.getLogger(__name__)
     configure_logs_to_file(logging.INFO, "load-database")
-    parser = ArgumentParser()
-    parser.add_argument("source", help="temporary storage path")
-    parser.add_argument("--user", type=str, help="database username")
-    parser.add_argument("--db", type=str, help="database name")
-    args = parser.parse_args()
-
-    wp_user = args.user or os.environ["WP_USER"]
-    wp_db = args.db or os.environ["WP_DB"]
-    db_password = os.environ.get("WP_PASSWORD", wp_user)
-    logger.info("USER: " + wp_user)
-    logger.info("DB: " + wp_db)
-    logger.info("init database")
-    engine = create_engine(
-        "mysql+pymysql://{}:{}@localhost:3306/wp".format(wp_user, db_password)
-    )
-    with engine.connect() as conn:
-        init_word_profile_tables(conn, wp_db)
-    engine = create_engine(
-        "mysql+pymysql://{}:{}@localhost/{}?charset=utf8mb4&local_infile=1".format(
-            wp_user, db_password, wp_db
-        )
-    )
-    logger.info("CREATE indices")
-    with engine.connect() as conn:
-        load_files_into_db(conn, args.source)
-        create_indices(conn)
-        logger.info("CREATE word profile stats")
-        create_statistics(conn)
+    parser = ArgumentParser()
+    parser.add_argument(
+        "source", help="data source dir"
+    )
+    parser.add_argument(
+        "--clear", help="Clear database before loading", action="store_true"
+    )
+
+    args = parser.parse_args()
+
+    load_db(open_db(clear=args.clear), args.source)
 
 
 if __name__ == "__main__":
```
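Connection parameters formerly taken from `--user`/`--db` and the `WP_USER`/`WP_DB`/`WP_PASSWORD` environment variables are now resolved by `open_db` from the `DB_USER`, `DB_PASSWORD`, `DB_HOST`, and `DB_NAME` settings in `wordprofile.config`.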
2 changes: 2 additions & 0 deletions wordprofile/config.py
```diff
@@ -21,3 +21,5 @@
 MIN_REL_FREQ = config("WP_MIN_REL_FREQ", cast=int, default=3)
 
 MWE = config("WP_MWE", cast=bool, default=False)
+
+max_form_length = 50
```
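`max_form_length` is consumed by the new `wordprofile/db.py` below, where it sizes the `VARCHAR` type (`FORM_TYPE`) used for all lemma and surface-form columns.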
192 changes: 192 additions & 0 deletions wordprofile/db.py
```python
import logging
from pathlib import Path

import sqlalchemy as sq
from sqlalchemy import Column, Enum, MetaData, Table, text, types
from sqlalchemy.engine import Connection
from sqlalchemy.sql import func
from sqlalchemy.sql.schema import Index

import wordprofile.config as config
from wordprofile.extract import relation_types, word_classes

logger = logging.getLogger(__name__)

FORM_TYPE = types.VARCHAR(config.max_form_length)
CORPUS_FILE_TYPE = types.Integer
RELATION_TYPE = Enum(relation_types)
TAG_TYPE = Enum(word_classes)

meta = MetaData()
corpus_files = Table(
    "corpus_files",
    meta,
    Column("id", CORPUS_FILE_TYPE),
    Column("corpus", types.VARCHAR(50)),
    Column("file", types.VARCHAR(200)),
    Column("orig", types.Text),
    Column("date", types.DateTime),
    Column("available", types.Text),
    mysql_engine="Aria",
)
concord_sentences = Table(
    "concord_sentences",
    meta,
    Column("corpus_file_id", CORPUS_FILE_TYPE),
    Column("sentence_id", types.Integer),
    Column("sentence", types.Text),
    Column("random_val", types.Float, server_default=func.rand()),
    mysql_engine="Aria",
)
matches = Table(
    "matches",
    meta,
    Column("id", types.Integer),
    Column("collocation_id", types.Integer),
    Column("head_surface", FORM_TYPE),
    Column("dep_surface", FORM_TYPE),
    Column("head_position", types.Integer),
    Column("dep_position", types.Integer),
    Column("prep_position", types.Text),
    Column("corpus_file_id", CORPUS_FILE_TYPE),
    Column("sentence_id", types.Integer),
    mysql_engine="Aria",
)
collocations = Table(
    "collocations",
    meta,
    Column("id", types.Integer),
    Column("label", RELATION_TYPE),
    Column("lemma1", FORM_TYPE),
    Column("lemma2", FORM_TYPE),
    Column("lemma1_tag", TAG_TYPE),
    Column("lemma2_tag", TAG_TYPE),
    Column("preposition", FORM_TYPE),
    Column("inv", types.Boolean, default=0),
    Column("frequency", types.Integer, default=1),
    Column("score", types.Float),
    mysql_engine="Aria",
)
mwe = Table(
    "mwe",
    meta,
    Column("id", types.Integer),
    Column("collocation1_id", types.Integer),
    Column("collocation2_id", types.Integer),
    Column("label", RELATION_TYPE),
    Column("lemma", FORM_TYPE),
    Column("lemma_tag", TAG_TYPE),
    Column("inv", types.Boolean, default=0),
    Column("frequency", types.Integer, default=1),
    Column("score", types.Float),
    mysql_engine="Aria",
)
mwe_match = Table(
    "mwe_match",
    meta,
    Column("mwe_id", types.Integer),
    Column("match1_id", types.Integer),
    Column("match2_id", types.Integer),
    mysql_engine="Aria",
)
corpus_freqs = Table(
    "corpus_freqs",
    meta,
    Column("label", RELATION_TYPE),
    Column("freq", types.Integer),
    Index("label_index", "label"),
    mysql_engine="Aria",
)
token_freqs = Table(
    "token_freqs",
    meta,
    Column("lemma", FORM_TYPE),
    Column("tag", TAG_TYPE),
    Column("freq", types.Integer),
    Column("surface", FORM_TYPE),
    Column("surface_freq", types.Integer),
    mysql_engine="Aria",
)

indices = (
    Index("corpus_index", corpus_files.c.id, unique=True),
    Index("concord_corpus_index", concord_sentences.c.corpus_file_id),
    Index(
        "concord_corpus_sentence_index",
        concord_sentences.c.corpus_file_id,
        concord_sentences.c.sentence_id,
    ),
    Index("rand_val", concord_sentences.c.random_val),
    Index("matches_index", matches.c.id, unique=True),
    Index("matches_corpus_index", matches.c.corpus_file_id),
    Index(
        "matches_corpus_sentence_index",
        matches.c.corpus_file_id,
        matches.c.sentence_id,
    ),
    Index("matches_relation_label_index", matches.c.collocation_id),
    Index("mwe_index", mwe.c.id, unique=True),
    Index("mwe_collocation1_index", mwe.c.collocation1_id),
    Index("mwe_match_index", mwe_match.c.mwe_id),
    Index("colloc_id", collocations.c.id, unique=True),
    Index("colloc_lemma1_index", collocations.c.lemma1),
    Index("colloc_lemma1_tag_index", collocations.c.lemma1, collocations.c.lemma1_tag),
    Index("colloc_lemma2_tag_index", collocations.c.lemma2, collocations.c.lemma2_tag),
    Index("colloc_lemma_index", collocations.c.lemma1, collocations.c.lemma2),
    Index("token_freq_lemma", token_freqs.c.lemma),
    Index("token_freq_lemma_tag", token_freqs.c.lemma, token_freqs.c.tag),
)


def open_db(create_schema=True, clear=False, **args):
    url = "mysql+pymysql://{}:{}@{}/{}?charset=utf8mb4&local_infile=1".format(
        config.DB_USER, config.DB_PASSWORD, config.DB_HOST, config.DB_NAME
    )
    db = sq.create_engine(url, **args)
    logger.info("Opening '%s'" % db)
    if clear:
        logger.info("Clearing '%s'" % db)
        with db.connect() as c:
            meta.drop_all(c)
    if create_schema:
        logger.info("Initializing '%s'" % db)
        with db.connect() as c:
            meta.create_all(c)
    return db


loaded_tables = (
    "corpus_files",
    "concord_sentences",
    "collocations",
    "token_freqs",
    "matches",
    "mwe",
    "mwe_match",
)


def load_db(db, data_dir):
    data_dir = Path(data_dir)
    logger.info("Loading '%s'" % data_dir)
    with db.connect() as c:
        logger.info("Dropping indices")
        for index in indices:
            index.drop(c)
        for table in loaded_tables:
            table_file = data_dir / table
            if not table_file.exists():
                logger.warning("Local file '%s' does not exist." % table_file)
                continue
            logger.info("Loading table '%s'" % table)
            sql = f"LOAD DATA LOCAL INFILE '{table_file}' INTO TABLE {table}"
            if table == "concord_sentences":
                sql += " (corpus_file_id, sentence_id, sentence)"
            sql += ";"
            c.execute(text(sql))
        logger.info("Creating corpus frequency statistics")
        c.execute(
            text(
                """
                INSERT INTO corpus_freqs (label, freq)
                SELECT label, SUM(frequency) as freq
                FROM collocations c
                GROUP BY label
                """
            )
        )
        logger.info("Creating indices")
        for index in indices:
            index.create(c)
```
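The module's two entry points compose exactly as the CLI and the tests use them: `open_db` builds a SQLAlchemy engine from the `DB_*` settings in `wordprofile.config`, and `load_db` expects a directory containing one staged dump file per name in `loaded_tables`. A minimal usage sketch (the path is illustrative):

```python
from wordprofile.db import load_db, open_db

# Build an engine for the configured MariaDB instance; clear=True first
# drops and recreates the schema defined by `meta`.
db = open_db(clear=True)

# Bulk-load the staged table files (corpus_files, matches, ...) via
# LOAD DATA LOCAL INFILE, then rebuild the indices and the corpus
# frequency statistics.
load_db(db, "test_wp/stats")
```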