Skip to content

Commit

Permalink
WATM, IIIF, MQL improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
dirkroorda committed Jul 5, 2024
1 parent 59134cd commit 6a2651a
Show file tree
Hide file tree
Showing 14 changed files with 3,379 additions and 59 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = text-fabric
version = 12.5.2
version = 12.5.3
description = Processor and browser for annotated text corpora
long_description = file README.md
author = Dirk Roorda
Expand Down
Empty file removed test/mql/2
Empty file.
2,941 changes: 2,939 additions & 2 deletions test/mql/mqltest.ipynb

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions tf/browser/ner/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@
dirExists,
APP_CONFIG,
)
from ...core.timestamp import SILENT_D, DEEP
from .sheets import Sheets
from .helpers import findCompile
from .sets import Sets
Expand Down Expand Up @@ -853,18 +854,21 @@ def bakeEntities(self, versionExtension="e"):
dirRemove(newTf)

app.indent(reset=True)
app.info("Creating a dataset with entity nodes ...")

if not silent:
app.info("Creating a dataset with entity nodes ...")

good = modify(
origTf,
newTf,
targetVersion=newVersion,
addTypes=addTypes,
featureMeta=featureMeta,
silent=silent,
silent=DEEP if silent else SILENT_D,
)

app.info("Done")
if not silent:
app.info("Done")

if not good:
return False
Expand Down
116 changes: 113 additions & 3 deletions tf/convert/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from textwrap import dedent

from ..core.helpers import console
from ..core.generic import AttrDict


PRE = "pre"
Expand Down Expand Up @@ -258,9 +259,7 @@ def checkModel(kind, thisModel, verbose):
modelDefault = (
LINE_MODEL_DEFAULT
if kind == LINE
else PAGE_MODEL_DEFAULT
if kind == PAGE
else SECTION_MODEL_DEFAULT
else PAGE_MODEL_DEFAULT if kind == PAGE else SECTION_MODEL_DEFAULT
)
modelSpecs = (
LINE_MODELS if kind == LINE else PAGE_MODELS if kind == PAGE else SECTION_MODELS
Expand Down Expand Up @@ -668,3 +667,114 @@ def lookupSource(cv, cur, tokenAsSlot, specs):
sourceText = (sourceText or "").strip()
source = {feature: sourceText}
cv.feature(targetNode, **source)


def parseIIIF(settings, prod, selector):
    """Parse the iiif yaml file.

    We fill in the parameters.

    The members of `settings[selector]` are walked recursively (through plain
    lists and dicts). In every string found:

    *   a string of the form `!name`, where `name` is a key of the macros,
        is replaced by the value of that macro, and that value is then
        filled in in the same way;
    *   every occurrence of `«name»`, where `name` is a key of the constants,
        is replaced by the value of that constant; if the string consists of
        exactly such a placeholder and the constant is an integer, the result
        is the integer itself, not its string representation.

    Parameters
    ----------
    settings: dict
        The contents of the yaml file. The keys `constants`, `macros`,
        `switches` and the key given by `selector` are read.
        This dict is left unmodified.
    prod: boolean
        If true, the `prod` member of the switches is applied,
        otherwise the `dev` member.
    selector: string
        The key in `settings` whose members must be filled in.

    Returns
    -------
    AttrDict
        The members of `settings[selector]`, with macros and constants
        filled in.
    """

    def applySwitches(prod, constants, switches):
        # Work on a copy: the caller's settings must not change under its
        # hands, otherwise a later call sees the leftovers of an earlier one.
        merged = dict(constants)
        merged.update(switches["prod" if prod else "dev"])

        return merged

    def substituteConstants(data, macros, constants, active=frozenset()):
        tpd = type(data)

        if tpd is str:
            name = data[1:]

            # Expand a macro reference before looking at the constants, so
            # that it also works when there are no constants at all, and when
            # the macro stands for a list or dict instead of a string.
            # `active` holds the macros being expanded right now: a macro that
            # refers back to itself is left as is, instead of looping forever.
            if (
                len(data) > 1
                and data[0] == "!"
                and name in macros
                and name not in active
            ):
                return substituteConstants(
                    macros[name], macros, constants, active | {name}
                )

            for k, v in constants.items():
                pattern = f"«{k}»"

                if type(v) is int and data == pattern:
                    # keep integers as integers
                    return v

                data = data.replace(pattern, str(v))

            return data

        if tpd is list:
            return [
                substituteConstants(item, macros, constants, active)
                for item in data
            ]

        if tpd is dict:
            return {
                k: substituteConstants(v, macros, constants, active)
                for (k, v) in data.items()
            }

        return data

    constants = applySwitches(prod, settings["constants"], settings["switches"])
    # the switches are also available as macros
    macros = applySwitches(prod, settings["macros"], settings["switches"])

    return AttrDict(
        {
            x: substituteConstants(xText, macros, constants)
            for (x, xText) in settings[selector].items()
        }
    )


def operationalize(data):
    """Regroup scan specifications by node type and parse their variables.

    Parameters
    ----------
    data: dict
        Keyed by the name of an extra feature; each value has the members
        `nodeType`, `vars` and `urlPattern`.
        Each member of `vars` is a string of the form `feat` or `parent.feat`,
        optionally followed by `-1` or `+1`.

    Returns
    -------
    dict
        Keyed by node type; each value is a list of tuples
        `(extraFeat, urlPattern, variables)`, where `variables` maps each
        variable name to a tuple `(parent, feat, shift)`.
        `parent` is `None` if the specification contains no dot,
        `shift` is `-1`, `1` or `0`.
    """
    suffixes = (("-1", -1), ("+1", 1))
    perType = {}

    for extraFeat, info in data.items():
        nodeType = info["nodeType"]
        variables = info["vars"]
        urlPattern = info["urlPattern"]

        parsed = {}

        for name, spec in variables.items():
            # split off a trailing shift indicator, if there is one
            for suffix, delta in suffixes:
                if spec.endswith(suffix):
                    base, shift = spec[:-2], delta
                    break
            else:
                base, shift = spec, 0

            # only the first dot separates the parent from the feature
            head, dot, tail = base.partition(".")
            parsed[name] = (head, tail, shift) if dot else (None, head, shift)

        if nodeType not in perType:
            perType[nodeType] = []

        perType[nodeType].append((extraFeat, urlPattern, parsed))

    return perType


def fillinIIIF(data, **kwargs):
    """Fill in placeholders of the form `{name}` in a nested data structure.

    Plain lists and dicts are walked recursively; dict keys are left alone.
    In every string, each `{name}` for which a keyword argument `name` is
    given is replaced by the string form of its value.
    If the string is exactly one such placeholder and the value is an integer,
    the integer itself is the result.
    Anything else is returned as is.

    Parameters
    ----------
    data: any
        The structure to fill in.
    kwargs: dict
        The values for the placeholders, by name.

    Returns
    -------
    any
        A new structure of the same shape, with the placeholders filled in.
    """
    kind = type(data)

    if kind is dict:
        return {key: fillinIIIF(value, **kwargs) for (key, value) in data.items()}

    if kind is list:
        return [fillinIIIF(element, **kwargs) for element in data]

    if kind is not str:
        return data

    text = data

    for name, value in kwargs.items():
        placeholder = "{" + name + "}"

        if text == placeholder and type(value) is int:
            # a bare placeholder for an integer yields the integer itself
            return value

        text = text.replace(placeholder, str(value))

    return text
160 changes: 160 additions & 0 deletions tf/convert/iiif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
from ..core.files import (
readYaml,
readJson,
writeJson,
fileOpen,
initTree,
dirExists,
dirCopy,
)
from ..core.helpers import console
from .helpers import parseIIIF, fillinIIIF


class IIIF:
    """Generate IIIF manifests for the page scans of a corpus.

    The manifests are built from templates in `programs/iiif.yaml` in the
    repository of the corpus, one manifest per folder, with one item per page.
    """

    def __init__(self, teiVersion, app, prod=False, silent=False):
        """Read the templates, the scan sizes and the page sequence.

        Parameters
        ----------
        teiVersion: string
            Used as a path component of the manifest and report directories.
        app: object
            A loaded Text-Fabric app; its `repoLocation`, `context` and
            `api.F` members are used.
        prod: boolean, optional False
            Whether to work with the production settings or with the
            development settings.
        silent: boolean, optional False
            Whether to suppress informational messages.
        """
        self.teiVersion = teiVersion
        self.app = app
        self.prod = prod
        self.silent = silent

        F = app.api.F

        repoLocation = app.repoLocation
        iiifDir = f"{repoLocation}/iiif"
        self.logoDir = f"{iiifDir}/logo"
        self.manifestDir = (
            f"{iiifDir}/manifests/{teiVersion}/{'prod' if prod else 'dev'}"
        )
        self.thumbDir = (
            f"{repoLocation}/{app.context.provenanceSpec['graphicsRelative']}"
        )
        self.origDir = f"{repoLocation}/scans"
        self.reportDir = f"{repoLocation}/report/{teiVersion}"

        settings = readYaml(asFile=f"{repoLocation}/programs/iiif.yaml", plain=True)
        self.templates = parseIIIF(settings, prod, "templates")

        self.getSizes()
        self.getPageSeq()
        pages = self.pages
        # the folders are the values of the feature `folder` on `folder` nodes
        folders = [F.folder.v(f) for f in F.otype.s("folder")]
        self.folders = folders

        self.console("Collections:")

        for folder in folders:
            n = len(pages[folder])
            self.console(f"{folder:>5} with {n:>4} pages")

    def console(self, msg, **kwargs):
        """Print something to the output.

        This works exactly as `tf.core.helpers.console`
        When the silent member of the object is True, the message will be suppressed.
        """
        silent = self.silent

        if not silent:
            console(msg, **kwargs)

    @staticmethod
    def _sizeStats(ws, hs):
        """Compute summary statistics of widths and heights.

        Parameters
        ----------
        ws: list of int
            The widths.
        hs: list of int
            The heights, as many as there are widths.

        Returns
        -------
        tuple
            `(maxW, maxH, avW, avH, devW, devH)`: the maxima, the rounded
            averages, and the rounded average absolute deviations from those
            averages. All zero if there are no sizes at all.
        """
        n = len(ws)

        if n == 0:
            # nothing to average over: avoid a division by zero
            return (0, 0, 0, 0, 0, 0)

        maxW = max(0, *ws)
        maxH = max(0, *hs)

        avW = int(round(sum(ws) / n))
        avH = int(round(sum(hs) / n))

        devW = int(round(sum(abs(w - avW) for w in ws) / n))
        devH = int(round(sum(abs(h - avH) for h in hs) / n))

        return (maxW, maxH, avW, avH, devW, devH)

    def getSizes(self):
        """Read the sizes of the scans and report some statistics.

        The sizes are read from the file `sizes.tsv`, in the directory of the
        original scans when in production mode, otherwise in the directory of
        the thumbnails.
        The first line of that file is skipped as a header; every other line
        has tab-separated fields: a page identifier, a width and a height.
        Blank lines are skipped.

        The result is stored in the member `sizeInfo`: a dict keyed by page
        identifier and valued by tuples `(width, height)`.
        """
        prod = self.prod
        thumbDir = self.thumbDir
        origDir = self.origDir
        sizeFile = f"{origDir if prod else thumbDir}/sizes.tsv"

        sizeInfo = {}
        self.sizeInfo = sizeInfo

        ws, hs = [], []

        with fileOpen(sizeFile) as rh:
            # skip the header line; a completely empty file has none
            next(rh, None)

            for line in rh:
                if not line.strip():
                    continue

                fields = line.rstrip("\n").split("\t")
                p = fields[0]
                (w, h) = (int(x) for x in fields[1:3])
                sizeInfo[p] = (w, h)
                ws.append(w)
                hs.append(h)

        (maxW, maxH, avW, avH, devW, devH) = self._sizeStats(ws, hs)

        self.console(f"Maximum dimensions: W = {maxW:>4} H = {maxH:>4}")
        self.console(f"Average dimensions: W = {avW:>4} H = {avH:>4}")
        self.console(f"Average deviation:  W = {devW:>4} H = {devH:>4}")

    def getPageSeq(self):
        """Read the page sequence from `pageseq.json` in the report directory.

        The result is stored in the member `pages`; elsewhere in this class it
        is indexed by folder, yielding the pages of that folder.
        """
        reportDir = self.reportDir
        pageSeqFile = f"{reportDir}/pageseq.json"
        self.pages = readJson(asFile=pageSeqFile, plain=True)

    def genFolder(self, folder):
        """Generate the manifest for a single folder and write it to file.

        For every page in the folder an item is made from the `canvasLevel`
        template, with the folder, the page, and the width and height of the
        scan filled in. Pages without size information get width and height 0.
        The manifest itself is made from the `manifestLevel` template, with
        the folder filled in, and the items are stored under its key `items`.

        The manifest is written to `folder.json` in the manifest directory.

        Parameters
        ----------
        folder: string
            The name of the folder; it must be a key in the page sequence.
        """
        templates = self.templates
        sizeInfo = self.sizeInfo
        pages = self.pages
        thesePages = pages[folder]

        canvasLevel = templates.canvasLevel

        items = []

        for p in thesePages:
            item = {}
            w, h = sizeInfo.get(p, (0, 0))

            for k, v in canvasLevel.items():
                v = fillinIIIF(v, folder=folder, page=p, width=w, height=h)
                item[k] = v

            items.append(item)

        manifestLevel = templates.manifestLevel
        manifestDir = self.manifestDir

        data = {}

        for k, v in manifestLevel.items():
            v = fillinIIIF(v, folder=folder)
            data[k] = v

        data["items"] = items

        writeJson(data, asFile=f"{manifestDir}/{folder}.json")

    def manifests(self):
        """Generate the manifests for all folders.

        The manifest directory is initialized afresh, a manifest is generated
        for each folder, and the logo directory, if it exists, is copied to
        the subdirectory `logo` of the manifest directory.
        """
        folders = self.folders
        manifestDir = self.manifestDir
        logoDir = self.logoDir

        # NOTE(review): presumably this creates the directory and clears any
        # previous content; confirm against tf.core.files.initTree
        initTree(manifestDir, fresh=True)

        for folder in folders:
            self.genFolder(folder)

        if dirExists(logoDir):
            dirCopy(logoDir, f"{manifestDir}/logo")
        else:
            # an error: reported regardless of the silent member
            console(f"Directory with logos not found: {logoDir}", error=True)

        self.console(f"IIIF manifests generated in {manifestDir}")
4 changes: 2 additions & 2 deletions tf/convert/makewatm.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,10 +402,10 @@ def doTask_watm(self):

console(f"\tMaking WATM for version {A.version}")

WA = WATM(A, "tei", skipMeta=False, silent=silent)
WA = WATM(A, "tei", skipMeta=False, silent=silent, prod=prod)
WA.makeText()
WA.makeAnno()
WA.writeAll(prod=prod)
WA.writeAll()
WA.testAll()

def doTask_watms(self):
Expand Down
Loading

0 comments on commit 6a2651a

Please sign in to comment.