feat(Database): Aggregate SQLAlchemy-based DB management in wordprofile.db

gremid committed Jan 9, 2025 · 1 parent 282e82e · commit edb07ad
Showing 11 changed files with 261 additions and 374 deletions.
45 changes: 21 additions & 24 deletions README.md
````diff
@@ -73,40 +73,37 @@ In this call the subcorpora in `test_wp/colloc` are merged, …
 With the `--mwe` option, after the subcorpora have been merged, chains of collocations ("multi-word expressions") are searched for, i.e. overlaps of two collocations.
 
 ### 3. Populating the database
 
 In this step the results are written to the database and indices are created on the database tables to allow collocations to be queried efficiently.
 
-```sh
-usage: load_database.py [-h] [--user USER] [--db DB] source
-positional arguments:
-  source       temporary storage path
-options:
-  -h, --help   show this help message and exit
-  --user USER  database username
-  --db DB      database name
-```
-
-The data is laid out in files such that it can be loaded directly into a SQL DB. The database is written to a local directory (`data/db`) which is mounted into the Docker container.
-Before the Docker container is started, make sure that this directory exists and is owned by the correct user, e.g.:
-```sh
-mkdir -p data/db
-```
-After that, the container can be started with
-```sh
-docker compose build # if the container does not exist yet
-docker compose up
-```
-Example invocation:
-```shell
-python wordprofile/cli/load_database.py test_wp/stats --user wpuser --db test_wp
-```
+#### 3.1. Starting a local database instance
+If no other settings are supplied via environment variables, a local database instance is populated. To make such an instance available, a suitable MariaDB container is started via Docker:
+The Docker container is started under the corresponding user; for this, the `USER_GROUP` environment variable must be set:
+```sh
+export USER_GROUP=$(id -u):$(id -g)
+docker compose up db
+```
+The instance's data resides on a Docker volume, from which it can be copied for transfer to other systems. Under GNU/Linux, for example, the volume is by default located in the file system under
+`/var/lib/docker/volumes/wordprofile_db/_data`
+To remove the container and its data after a profile has been built, use the command
+```sh
+docker compose down db -v
+```
+#### 3.2. Populating the database
+The data is laid out in files such that it can be loaded directly into a SQL DB.
+The following script takes care of this:
+```sh
+python wordprofile/cli/load_database.py test_wp/stats
+```
+The optional `--clear` parameter wipes the database before (re)populating it.
 
 ## Preprocessing
 
 For converting `.tabs` files to `.conll`, the Python scripts `data_update.py` or `tabs2conllu.py` can be used (in the directory `wordprofile/preprocessing/cli/`).
````
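Read end to end, the updated section 3 amounts to the following sequence (a sketch assembled purely from the commands above; `--clear` is optional):

```sh
export USER_GROUP=$(id -u):$(id -g)
docker compose up db                                    # start the local MariaDB instance
python wordprofile/cli/load_database.py test_wp/stats --clear
docker compose down db -v                               # afterwards: remove container and data
```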
4 changes: 4 additions & 0 deletions pyproject.toml
```diff
@@ -31,6 +31,10 @@ profile = "black"
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 
+[[tool.mypy.overrides]]
+module = ["pymysql", "MySQLdb"]
+ignore_missing_imports = true
+
 [tool.setuptools.dynamic]
 readme = {file = ["README.md"], content-type = "text/markdown"}
 version = {attr = "wordprofile.version.__version__"}
```
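The new override silences mypy's missing-stub errors for the `pymysql` and `MySQLdb` MySQL driver modules, which ship without type annotations.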
20 changes: 4 additions & 16 deletions tests/test_connector.py
```diff
@@ -1,17 +1,14 @@
 import os
-import pathlib
+from pathlib import Path
 from subprocess import check_call
 import unittest
 from time import sleep
 
 import pytest
-import sqlalchemy as sq
-
-import wordprofile.wpse.create as wc
+from wordprofile.db import open_db, load_db
 from wordprofile.datatypes import Coocc
 from wordprofile.wpse.connector import WPConnect
 from wordprofile.wpse.mwe_connector import WPMweConnect
-from wordprofile.wpse.processing import load_files_into_db
 
 
 @pytest.fixture(autouse=True, scope="session")
@@ -24,22 +21,13 @@ def test_db():
     check_call(["docker", "compose", "-p", "wp_test", "down", "db", "-v"])
 
 
-def create_database():
-    engine = sq.create_engine(
-        "mysql+pymysql://wp:wp@localhost:3306/wp?charset=utf8mb4&local_infile=1"
-    )
-    with engine.connect() as conn:
-        wc.init_word_profile_tables(conn, "test")
-        data_dir = pathlib.Path(__file__).parent / "testdata" / "test_db"
-        load_files_into_db(conn, data_dir)
-        wc.create_indices(conn)
-        wc.create_statistics(conn)
+db_test_data_dir = Path(__file__).parent / "testdata" / "test_db"
 
 
 class WPConnectTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        create_database()
+        load_db(open_db(clear=True), db_test_data_dir)
         cls.connector = WPConnect(
             host="localhost", user="wp", passwd="wp", dbname="wp"
         )
```
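With `create_database` gone, the fixture-backed tests exercise the same `open_db`/`load_db` path as the CLI below instead of duplicating engine construction and schema setup.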
45 changes: 10 additions & 35 deletions wordprofile/cli/load_database.py
```diff
@@ -1,49 +1,24 @@
 import logging
-import os
 from argparse import ArgumentParser
 
-from sqlalchemy import create_engine
-
+from wordprofile.db import open_db, load_db
 from wordprofile.utils import configure_logs_to_file
-from wordprofile.wpse.create import (
-    create_indices,
-    create_statistics,
-    init_word_profile_tables,
-)
-from wordprofile.wpse.processing import load_files_into_db
 
 
 def main():
-    logger = logging.getLogger(__name__)
     configure_logs_to_file(logging.INFO, "load-database")
-    parser = ArgumentParser()
-    parser.add_argument("source", help="temporary storage path")
-    parser.add_argument("--user", type=str, help="database username")
-    parser.add_argument("--db", type=str, help="database name")
-    args = parser.parse_args()
-
-    wp_user = args.user or os.environ["WP_USER"]
-    wp_db = args.db or os.environ["WP_DB"]
-    db_password = os.environ.get("WP_PASSWORD", wp_user)
-    logger.info("USER: " + wp_user)
-    logger.info("DB: " + wp_db)
-    logger.info("init database")
-    engine = create_engine(
-        "mysql+pymysql://{}:{}@localhost:3306/wp".format(wp_user, db_password)
-    )
-    with engine.connect() as conn:
-        init_word_profile_tables(conn, wp_db)
-    engine = create_engine(
-        "mysql+pymysql://{}:{}@localhost/{}?charset=utf8mb4&local_infile=1".format(
-            wp_user, db_password, wp_db
-        )
-    )
-    logger.info("CREATE indices")
-    with engine.connect() as conn:
-        load_files_into_db(conn, args.source)
-        create_indices(conn)
-        logger.info("CREATE word profile stats")
-        create_statistics(conn)
+    parser = ArgumentParser()
+    parser.add_argument(
+        "source", help="data source dir"
+    )
+    parser.add_argument(
+        "--clear", help="Clear database before loading", action="store_true"
+    )
+
+    args = parser.parse_args()
+
+    load_db(open_db(clear=args.clear), args.source)
 
 
 if __name__ == "__main__":
```
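Connection parameters formerly taken from `--user`/`--db` and the `WP_USER`/`WP_DB`/`WP_PASSWORD` environment variables are now resolved by `open_db` from the `DB_USER`, `DB_PASSWORD`, `DB_HOST`, and `DB_NAME` settings in `wordprofile.config`.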
2 changes: 2 additions & 0 deletions wordprofile/config.py
```diff
@@ -21,3 +21,5 @@
 MIN_REL_FREQ = config("WP_MIN_REL_FREQ", cast=int, default=3)
 
 MWE = config("WP_MWE", cast=bool, default=False)
+
+max_form_length = 50
```
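`max_form_length` is consumed by the new `wordprofile/db.py` below, where it sizes the `VARCHAR` type (`FORM_TYPE`) used for all lemma and surface-form columns.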
192 changes: 192 additions & 0 deletions wordprofile/db.py
```python
import logging
from pathlib import Path

import sqlalchemy as sq
from sqlalchemy import Column, Enum, MetaData, Table, text, types
from sqlalchemy.engine import Connection
from sqlalchemy.sql import func
from sqlalchemy.sql.schema import Index

import wordprofile.config as config
from wordprofile.extract import relation_types, word_classes

logger = logging.getLogger(__name__)

FORM_TYPE = types.VARCHAR(config.max_form_length)
CORPUS_FILE_TYPE = types.Integer
RELATION_TYPE = Enum(relation_types)
TAG_TYPE = Enum(word_classes)

meta = MetaData()
corpus_files = Table(
    "corpus_files",
    meta,
    Column("id", CORPUS_FILE_TYPE),
    Column("corpus", types.VARCHAR(50)),
    Column("file", types.VARCHAR(200)),
    Column("orig", types.Text),
    Column("date", types.DateTime),
    Column("available", types.Text),
    mysql_engine="Aria",
)
concord_sentences = Table(
    "concord_sentences",
    meta,
    Column("corpus_file_id", CORPUS_FILE_TYPE),
    Column("sentence_id", types.Integer),
    Column("sentence", types.Text),
    Column("random_val", types.Float, server_default=func.rand()),
    mysql_engine="Aria",
)
matches = Table(
    "matches",
    meta,
    Column("id", types.Integer),
    Column("collocation_id", types.Integer),
    Column("head_surface", FORM_TYPE),
    Column("dep_surface", FORM_TYPE),
    Column("head_position", types.Integer),
    Column("dep_position", types.Integer),
    Column("prep_position", types.Text),
    Column("corpus_file_id", CORPUS_FILE_TYPE),
    Column("sentence_id", types.Integer),
    mysql_engine="Aria",
)
collocations = Table(
    "collocations",
    meta,
    Column("id", types.Integer),
    Column("label", RELATION_TYPE),
    Column("lemma1", FORM_TYPE),
    Column("lemma2", FORM_TYPE),
    Column("lemma1_tag", TAG_TYPE),
    Column("lemma2_tag", TAG_TYPE),
    Column("preposition", FORM_TYPE),
    Column("inv", types.Boolean, default=0),
    Column("frequency", types.Integer, default=1),
    Column("score", types.Float),
    mysql_engine="Aria",
)
mwe = Table(
    "mwe",
    meta,
    Column("id", types.Integer),
    Column("collocation1_id", types.Integer),
    Column("collocation2_id", types.Integer),
    Column("label", RELATION_TYPE),
    Column("lemma", FORM_TYPE),
    Column("lemma_tag", TAG_TYPE),
    Column("inv", types.Boolean, default=0),
    Column("frequency", types.Integer, default=1),
    Column("score", types.Float),
    mysql_engine="Aria",
)
mwe_match = Table(
    "mwe_match",
    meta,
    Column("mwe_id", types.Integer),
    Column("match1_id", types.Integer),
    Column("match2_id", types.Integer),
    mysql_engine="Aria",
)
corpus_freqs = Table(
    "corpus_freqs",
    meta,
    Column("label", RELATION_TYPE),
    Column("freq", types.Integer),
    Index("label_index", "label"),
    mysql_engine="Aria",
)
token_freqs = Table(
    "token_freqs",
    meta,
    Column("lemma", FORM_TYPE),
    Column("tag", TAG_TYPE),
    Column("freq", types.Integer),
    Column("surface", FORM_TYPE),
    Column("surface_freq", types.Integer),
    mysql_engine="Aria",
)

indices = (
    Index("corpus_index", corpus_files.c.id, unique=True),
    Index("concord_corpus_index", concord_sentences.c.corpus_file_id),
    Index(
        "concord_corpus_sentence_index",
        concord_sentences.c.corpus_file_id,
        concord_sentences.c.sentence_id,
    ),
    Index("rand_val", concord_sentences.c.random_val),
    Index("matches_index", matches.c.id, unique=True),
    Index("matches_corpus_index", matches.c.corpus_file_id),
    Index(
        "matches_corpus_sentence_index",
        matches.c.corpus_file_id,
        matches.c.sentence_id,
    ),
    Index("matches_relation_label_index", matches.c.collocation_id),
    Index("mwe_index", mwe.c.id, unique=True),
    Index("mwe_collocation1_index", mwe.c.collocation1_id),
    Index("mwe_match_index", mwe_match.c.mwe_id),
    Index("colloc_id", collocations.c.id, unique=True),
    Index("colloc_lemma1_index", collocations.c.lemma1),
    Index("colloc_lemma1_tag_index", collocations.c.lemma1, collocations.c.lemma1_tag),
    Index("colloc_lemma2_tag_index", collocations.c.lemma2, collocations.c.lemma2_tag),
    Index("colloc_lemma_index", collocations.c.lemma1, collocations.c.lemma2),
    Index("token_freq_lemma", token_freqs.c.lemma),
    Index("token_freq_lemma_tag", token_freqs.c.lemma, token_freqs.c.tag),
)


def open_db(create_schema=True, clear=False, **args):
    url = "mysql+pymysql://{}:{}@{}/{}?charset=utf8mb4&local_infile=1".format(
        config.DB_USER, config.DB_PASSWORD, config.DB_HOST, config.DB_NAME
    )
    db = sq.create_engine(url, **args)
    logger.info("Opening '%s'" % db)
    if clear:
        logger.info("Clearing '%s'" % db)
        with db.connect() as c:
            meta.drop_all(c)
    if create_schema:
        logger.info("Initializing '%s'" % db)
        with db.connect() as c:
            meta.create_all(c)
    return db


loaded_tables = (
    "corpus_files",
    "concord_sentences",
    "collocations",
    "token_freqs",
    "matches",
    "mwe",
    "mwe_match",
)


def load_db(db, data_dir):
    data_dir = Path(data_dir)
    logger.info("Loading '%s'" % data_dir)
    with db.connect() as c:
        logger.info("Dropping indices")
        for index in indices:
            index.drop(c)
        for table in loaded_tables:
            table_file = data_dir / table
            if not table_file.exists():
                logger.warning("Local file '%s' does not exist." % table_file)
                continue
            logger.info("Loading table '%s'" % table)
            sql = f"LOAD DATA LOCAL INFILE '{table_file}' INTO TABLE {table}"
            if table == "concord_sentences":
                sql += " (corpus_file_id, sentence_id, sentence)"
            sql += ";"
            c.execute(text(sql))
        logger.info("Creating corpus frequency statistics")
        c.execute(
            text(
                """
                INSERT INTO corpus_freqs (label, freq)
                SELECT label, SUM(frequency) as freq
                FROM collocations c
                GROUP BY label
                """
            )
        )
        logger.info("Creating indices")
        for index in indices:
            index.create(c)
```
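The module's two entry points compose exactly as the CLI and the tests use them: `open_db` builds a SQLAlchemy engine from the `DB_*` settings in `wordprofile.config`, and `load_db` expects a directory containing one staged dump file per name in `loaded_tables`. A minimal usage sketch (the path is illustrative):

```python
from wordprofile.db import load_db, open_db

# Build an engine for the configured MariaDB instance; clear=True first
# drops and recreates the schema defined by `meta`.
db = open_db(clear=True)

# Bulk-load the staged table files (corpus_files, matches, ...) via
# LOAD DATA LOCAL INFILE, then rebuild the indices and the corpus
# frequency statistics.
load_db(db, "test_wp/stats")
```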