diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..4468690
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+select = C,E,F,W,B,B950
+ignore = E203, E501, W503
+builtins = C,E,Eall,Es,F,Fall,Fs,L,N,S,T,TF
diff --git a/programs/remains.ipynb b/programs/remains.ipynb
index dd64aa3..8cc6907 100644
--- a/programs/remains.ipynb
+++ b/programs/remains.ipynb
@@ -41,29 +41,34 @@
}
],
"source": [
- "TF.info('Writing {} trees'.format(rootType))\n",
- "treeFile = '{}/trees-BHSA.txt'.format(OUTPUTDIR)\n",
- "with open(treeFile, 'w') as trees:\n",
- " verseLabel = ''\n",
+ "TF.info(\"Writing {} trees\".format(rootType))\n",
+ "treeFile = \"{}/trees-BHSA.txt\".format(OUTPUTDIR)\n",
+ "with open(treeFile, \"w\") as trees:\n",
+ " verseLabel = \"\"\n",
" s = 0\n",
" chunk = 10000\n",
" sc = 0\n",
" for node in F.otype.s(rootType):\n",
- " if node in skip: continue\n",
- " (treeRep, wordsRep, bSlot) = tree.writeTree(node, 'r', getTag, rev=False, leafNumbers=False)\n",
- " trees.write('\\n#{}\\tnode={}\\tbSlot={}\\t{}\\n{}\\n'.format(\n",
- " '{} {}:{}'.format(*T.sectionFromNode(node)), \n",
- " node,\n",
- " bSlot, \n",
- " wordsRep,\n",
- " treeRep,\n",
- " ))\n",
+ " if node in skip:\n",
+ " continue\n",
+ " (treeRep, wordsRep, bSlot) = tree.writeTree(\n",
+ " node, \"r\", getTag, rev=False, leafNumbers=False\n",
+ " )\n",
+ " trees.write(\n",
+ " \"\\n#{}\\tnode={}\\tbSlot={}\\t{}\\n{}\\n\".format(\n",
+ " \"{} {}:{}\".format(*T.sectionFromNode(node)),\n",
+ " node,\n",
+ " bSlot,\n",
+ " wordsRep,\n",
+ " treeRep,\n",
+ " )\n",
+ " )\n",
" s += 1\n",
" sc += 1\n",
" if sc == chunk:\n",
" TF.info(\"{} trees written\".format(s))\n",
" sc = 0\n",
- "TF.info('{} trees written to {}'.format(s, treeFile))"
+ "TF.info(\"{} trees written to {}\".format(s, treeFile))"
]
},
{
@@ -130,29 +135,34 @@
}
],
"source": [
- "TF.info('Writing {} trees'.format(rootType))\n",
- "treeFile = '{}/trees-BHSA-nodes.txt'.format(OUTPUTDIR)\n",
- "with open(treeFile, 'w') as trees:\n",
- " verseLabel = ''\n",
+ "TF.info(\"Writing {} trees\".format(rootType))\n",
+ "treeFile = \"{}/trees-BHSA-nodes.txt\".format(OUTPUTDIR)\n",
+ "with open(treeFile, \"w\") as trees:\n",
+ " verseLabel = \"\"\n",
" s = 0\n",
" chunk = 10000\n",
" sc = 0\n",
" for node in F.otype.s(rootType):\n",
- " if node in skip: continue\n",
- " (treeRep, wordsRep, bSlot) = tree.writeTree(node, 'r', getTagN, rev=False, leafNumbers=False)\n",
- " trees.write('\\n#{}\\tnode={}\\tbSlot={}\\t{}\\n{}\\n'.format(\n",
- " '{} {}:{}'.format(*T.sectionFromNode(node)), \n",
- " node,\n",
- " bSlot, \n",
- " wordsRep,\n",
- " treeRep,\n",
- " ))\n",
+ " if node in skip:\n",
+ " continue\n",
+ " (treeRep, wordsRep, bSlot) = tree.writeTree(\n",
+ " node, \"r\", getTagN, rev=False, leafNumbers=False\n",
+ " )\n",
+ " trees.write(\n",
+ " \"\\n#{}\\tnode={}\\tbSlot={}\\t{}\\n{}\\n\".format(\n",
+ " \"{} {}:{}\".format(*T.sectionFromNode(node)),\n",
+ " node,\n",
+ " bSlot,\n",
+ " wordsRep,\n",
+ " treeRep,\n",
+ " )\n",
+ " )\n",
" s += 1\n",
" sc += 1\n",
" if sc == chunk:\n",
" TF.info(\"{} trees written\".format(s))\n",
" sc = 0\n",
- "TF.info('{} trees written to {}'.format(s, treeFile))"
+ "TF.info(\"{} trees written to {}\".format(s, treeFile))"
]
},
{
@@ -243,23 +253,32 @@
" vNode = T.nodeFromSection(passage)\n",
" return L.d(vNode, otype=rootType)\n",
"\n",
+ "\n",
"def showcases(cases, oFile):\n",
- " with open(oFile, 'w') as out:\n",
+ " with open(oFile, \"w\") as out:\n",
" for (sNode, caseText) in cases.items():\n",
- " out.write('\\n====================\\n{}\\n{}\\n{} TF-node={}:\\n'.format(\n",
- " '{} {}:{}'.format(*T.sectionFromNode(sNode)),\n",
- " caseText, \n",
- " rootType, \n",
- " sNode,\n",
- " ))\n",
- " for kind in ('e', 'r'):\n",
- " out.write('\\nTree based on slot embedding {}\\n\\n'.format(\n",
- " 'only' if kind == 'e' else ' and mother+clause_constituent relation'\n",
- " ))\n",
- " (treeRep, wordsRep, bSlot) = tree.writeTree(sNode, kind, getTag, rev=False, leafNumbers=False)\n",
- " out.write('{}\\n\\n{}\\n'.format(wordsRep, treeRep))\n",
- " out.write('\\nDepth={}\\n'.format(tree.depth(sNode, kind)))\n",
- " out.write(tree.debugWriteTree(sNode, kind, legenda=kind=='r'))"
+ " out.write(\n",
+ " \"\\n====================\\n{}\\n{}\\n{} TF-node={}:\\n\".format(\n",
+ " \"{} {}:{}\".format(*T.sectionFromNode(sNode)),\n",
+ " caseText,\n",
+ " rootType,\n",
+ " sNode,\n",
+ " )\n",
+ " )\n",
+ " for kind in (\"e\", \"r\"):\n",
+ " out.write(\n",
+ " \"\\nTree based on slot embedding {}\\n\\n\".format(\n",
+ " \"only\"\n",
+ " if kind == \"e\"\n",
+ " else \" and mother+clause_constituent relation\"\n",
+ " )\n",
+ " )\n",
+ " (treeRep, wordsRep, bSlot) = tree.writeTree(\n",
+ " sNode, kind, getTag, rev=False, leafNumbers=False\n",
+ " )\n",
+ " out.write(\"{}\\n\\n{}\\n\".format(wordsRep, treeRep))\n",
+ " out.write(\"\\nDepth={}\\n\".format(tree.depth(sNode, kind)))\n",
+ " out.write(tree.debugWriteTree(sNode, kind, legenda=kind == \"r\"))"
]
},
{
@@ -280,28 +299,30 @@
"source": [
"# below holds for etcbc3, in etcbc4 we have fewer problem cases\n",
"\n",
- "problem_desc = collections.OrderedDict((\n",
- " (1131739, \"debug reorder\"),\n",
- " (1131712, \"interesting\"), \n",
- " (1131701, \"interesting\"),\n",
- " (1140469, \"subject clause order\"),\n",
- " (passageRoots(('Genesis', 1, 16))[0], \"interesting\"), \n",
- " (1164864, \"interesting\"),\n",
- " (1143081, \"cyclic mothers\"),\n",
- " (1153973, \"cyclic mothers\"),\n",
- " (1158971, \"cyclic mothers\"),\n",
- " (1158971, \"cyclic mothers\"),\n",
- " (1160416, \"cyclic mothers\"),\n",
- " (1160464, \"cyclic mothers\"),\n",
- " (1161141, \"nested cyclic mothers: C.coor => C.attr => P below first C.coor\"), \n",
- " (1163666, \"cyclic mothers\"), \n",
- " (1164830, \"cyclic mothers\"), \n",
- " (1167680, \"cyclic mothers\"), \n",
- " (1170057, \"cyclic mothers\"), \n",
- " (1193065, \"cyclic mothers\"), \n",
- " (1199681, \"cyclic mothers\"), \n",
- " (1199682, \"mother points outside sentence\"),\n",
- "))\n",
+ "problem_desc = collections.OrderedDict(\n",
+ " (\n",
+ " (1131739, \"debug reorder\"),\n",
+ " (1131712, \"interesting\"),\n",
+ " (1131701, \"interesting\"),\n",
+ " (1140469, \"subject clause order\"),\n",
+ " (passageRoots((\"Genesis\", 1, 16))[0], \"interesting\"),\n",
+ " (1164864, \"interesting\"),\n",
+ " (1143081, \"cyclic mothers\"),\n",
+ " (1153973, \"cyclic mothers\"),\n",
+ " (1158971, \"cyclic mothers\"),\n",
+ " (1158971, \"cyclic mothers\"),\n",
+ " (1160416, \"cyclic mothers\"),\n",
+ " (1160464, \"cyclic mothers\"),\n",
+ " (1161141, \"nested cyclic mothers: C.coor => C.attr => P below first C.coor\"),\n",
+ " (1163666, \"cyclic mothers\"),\n",
+ " (1164830, \"cyclic mothers\"),\n",
+ " (1167680, \"cyclic mothers\"),\n",
+ " (1170057, \"cyclic mothers\"),\n",
+ " (1193065, \"cyclic mothers\"),\n",
+ " (1199681, \"cyclic mothers\"),\n",
+ " (1199682, \"mother points outside sentence\"),\n",
+ " )\n",
+ ")\n",
"fixedSample = (\n",
" 1167680,\n",
" 1167152,\n",
@@ -330,23 +351,19 @@
"motherKeys = list(sorted(tree.mother))\n",
"for s in range(20):\n",
" r = random.randint(0, len(motherKeys) - 1)\n",
- " sNode = tree.getRoot(tree.mother[motherKeys[r]], 'e')[0]\n",
- " sample[sNode] = 'random sample in {}s with {}s with mothers'.format(rootType, clauseType)\n",
+ " sNode = tree.getRoot(tree.mother[motherKeys[r]], \"e\")[0]\n",
+ " sample[sNode] = \"random sample in {}s with {}s with mothers\".format(\n",
+ " rootType, clauseType\n",
+ " )\n",
"for sNode in fixedSample:\n",
- " fSample[sNode] = 'random sample in {}s with {}s with mothers'.format(rootType, clauseType)\n",
+ " fSample[sNode] = \"random sample in {}s with {}s with mothers\".format(\n",
+ " rootType, clauseType\n",
+ " )\n",
"\n",
- "#showcases(problemDesc, 'tree-notabene.txt')\n",
- "#showcases(sample, '{}/trees-{}-random-{}.txt'.format(OUTPUTDIR, VERSION, sampleSize))\n",
- "#showcases(fsample, 'trees-fixed-{}.txt'.format(len(fsample)))"
+ "# showcases(problem_desc, 'tree-notabene.txt')\n",
+ "# showcases(sample, '{}/trees-{}-random-{}.txt'.format(OUTPUTDIR, VERSION, sampleSize))\n",
+ "# showcases(fSample, 'trees-fixed-{}.txt'.format(len(fSample)))"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bf8dbbdf-8700-4528-864e-978c3db635e1",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/programs/trees.ipynb b/programs/trees.ipynb
index 0dcfa64..9d4febb 100644
--- a/programs/trees.ipynb
+++ b/programs/trees.ipynb
@@ -25,7 +25,7 @@
"This notebook composes syntax trees out of the\n",
"[BHSA](https://etcbc.github.io/bhsa/) dataset of the Hebrew Bible, its text and its linguistic annotations.\n",
"\n",
- "The source data is the \n",
+ "The source data is the\n",
"[text-fabric](https://github.com/Dans-labs/text-fabric/wiki) representation of this dataset.\n",
"\n",
"The result is a set of roughly 65,000 tree structures, one for each sentence, in\n",
@@ -67,8 +67,8 @@
"The process of tree construction is not straightforward,\n",
"since the BHSA data have not been coded as syntax trees.\n",
"Rather they take the shape of a collection of features that describe\n",
- "observable characteristics of the words, phrases, clauses and sentences. \n",
- "Moreover, if a phrase, clause or sentence is discontinuous, \n",
+ "observable characteristics of the words, phrases, clauses and sentences.\n",
+ "Moreover, if a phrase, clause or sentence is discontinuous,\n",
"it is divided in *phrase_atoms*, *clause_atoms*,\n",
"or *sentence_atoms*, respectively, which are by definition continuous.\n",
"\n",
@@ -81,11 +81,11 @@
"This notebook makes use of a Python module `tree.py` (in the same directory).\n",
"This module works on top of Text-Fabric and knows the general structure of an ancient text.\n",
"It constructs a hierarchy of words, subphrases, phrases, clauses and sentences\n",
- "based on the embedding relationship. \n",
+ "based on the embedding relationship.\n",
"\n",
- "But this is not all. \n",
+ "But this is not all.\n",
"The BHSA data contains a *mother* relationship,\n",
- "which denotes linguistic dependency. \n",
+ "which denotes linguistic dependency.\n",
"The module `tree.py` reconstructs the tree obtained from the embedding relationship\n",
"by using the mother relationship as a set of instructions to move certain nodes below others.\n",
"In some cases extra nodes will be constructed as well."
@@ -103,39 +103,39 @@
"metadata": {},
"source": [
"### Nodes:\n",
- "The BHSA data is coded in such a way that every node is associated with a *type* and a *slot set*. \n",
+ "The BHSA data is coded in such a way that every node is associated with a *type* and a *slot set*.\n",
"\n",
- "The *type* of a node, $T(O)$, determines which features a node has. \n",
+ "The *type* of a node, $T(O)$, determines which features a node has.\n",
"BHSA types are `sentence`, `sentence_atom`,\n",
"`clause`, `clause_atom`, `phrase`, `phrase_atom`, `subphrase`, `word`, and there are also\n",
"the non-linguistic types `book`, `chapter`, `verse` and `half_verse`.\n",
"\n",
- "There is an implicit *ordering of node types*, given by the sequence above, where `word` comes first and \n",
+ "There is an implicit *ordering of node types*, given by the sequence above, where `word` comes first and\n",
"`sentence` comes last. We denote this ordering by $<$.\n",
"\n",
"The *slot set* of a node, $m(O)$, is the set of word occurrences linked to that node.\n",
"Every word occurrence in the source occupies a unique slot, which is a number, so slot sets are sets of numbers.\n",
"Think of the slots as the textual positions of individual words throughout the whole text.\n",
"\n",
- "Note that when a sentence contains a clause which contains a phrase, \n",
+ "Note that when a sentence contains a clause which contains a phrase,\n",
"the sentence, clause, and phrase are linked to slot sets that contain each other.\n",
"The fact that a sentence \"contains\" a clause is not marked directly,\n",
"it is a consequence of how the slot sets they are linked to are embedded.\n",
"\n",
"### Definition (slot set order):\n",
- "There is a \n",
+ "There is a\n",
"[natural order](https://github.com/Dans-labs/text-fabric/wiki/Api#sorting-nodes)\n",
"on slot sets, which we will use.\n",
"\n",
- "We will not base our trees on *all* node types, \n",
+ "We will not base our trees on *all* node types,\n",
"since in the BHSA data they do not constitute a single hierarchy.\n",
"We will restrict ourselves to the set $\\cal O = \\{$ ``sentence``, ``clause``, ``phrase``, ``word`` $\\}$.\n",
"\n",
"### Definition (directly below):\n",
- "Node type $T_1$ \n",
- "is *directly below* \n",
- "$T_2$ ( $T_1 <_1 T_2 $ ) in $\\cal O$ \n",
- "if $T_1 < T_2$ \n",
+ "Node type $T_1$\n",
+ "is *directly below*\n",
+ "$T_2$ ( $T_1 <_1 T_2 $ ) in $\\cal O$\n",
+ "if $T_1 < T_2$\n",
"and there is no $T$ in $\\cal O$ with\n",
"$T_1 < T < T_2$.\n",
"\n",
@@ -144,7 +144,7 @@
"\n",
"### Definition (parent)\n",
"Node $A$ is a parent of node $B$ if the following are true:\n",
- "1. $m(A) \\subseteq\\ m(B)$ \n",
+ "1. $m(B) \\subseteq\\ m(A)$\n",
"2. $T(B) <_1 T(A)$ in $\\cal O$."
]
},
@@ -159,9 +159,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "While using the embedding got us trees, \n",
+ "While using the embedding got us trees,\n",
"using the mother relationship will give us more interesting trees.\n",
- "In general, the *mother* in the BHSA dataset points to a node \n",
+ "In general, the *mother* in the BHSA dataset points to a node\n",
"on which the node in question is, in some sense, dependent.\n",
"The nature of this dependency is coded in a specific feature on clauses,\n",
"the `clause_constituent_relation` in version 3,\n",
@@ -179,28 +179,28 @@
"\n",
"In case 3 we do nothing.\n",
"\n",
- "In case 1 we remove the link of the clause to its parent \n",
+ "In case 1 we remove the link of the clause to its parent\n",
"and add the clause as a child to either the node\n",
- "that the mother points to, or to the parent of the mother. \n",
+ "that the mother points to, or to the parent of the mother.\n",
"We do the latter only if the mother is a word.\n",
"We will not add children to words.\n",
"\n",
- "In the diagrams, the red arrows represent the mother relationship, \n",
- "and the black arrows the embedding relationships, \n",
- "and the fat black arrows the new parent relationships. \n",
+ "In the diagrams, the red arrows represent the mother relationship,\n",
+ "and the black arrows the embedding relationships,\n",
+ "and the fat black arrows the new parent relationships.\n",
"The gray arrows indicate severed parent links.\n",
"\n",
"
\n",
"\n",
"In case 2 we create a node between the mother and its parent.\n",
- "This node takes the name of the mother, and the mother will be added as child, \n",
+ "This node takes the name of the mother, and the mother will be added as child,\n",
"but with name ``Ccoor``, and the clause which points to the mother is added as a sister.\n",
"\n",
- "This is a rather complicated case, but the intuition is not that difficult. \n",
+ "This is a rather complicated case, but the intuition is not that difficult.\n",
"Consider the sentence:\n",
"\n",
" John thinks that Mary said it and did it\n",
- " \n",
+ "\n",
"We have a compound object sentence, with ``Mary said it`` and ``did it`` as coordinated components.\n",
"The way this has been marked up in the BHSA database is as follows:\n",
"\n",
@@ -208,47 +208,47 @@
"\n",
"``and did it``, clause with ``clause_constituent_relation``=``Coor``, ``mother``=``Mary said it``(clause)\n",
"\n",
- "So the second coordinated clause is simply linked to the first coordinated clause. \n",
- "Restructuring means to create a parent for both coordinated clauses \n",
- "and treat both as sisters at the same hierarchical level. \n",
+ "So the second coordinated clause is simply linked to the first coordinated clause.\n",
+ "Restructuring means to create a parent for both coordinated clauses\n",
+ "and treat both as sisters at the same hierarchical level.\n",
"See the diagram.\n",
"\n",
"
\n",
"\n",
"### Note on order\n",
- "When we add nodes to new parents, we let them occupy the sequential position \n",
+ "When we add nodes to new parents, we let them occupy the sequential position\n",
"among their new sisters that corresponds with the slot set ordering.\n",
"\n",
"### Note on discontinuity\n",
- "Sentences, clauses and phrases are not always continuous. \n",
+ "Sentences, clauses and phrases are not always continuous.\n",
"Before restructuring it will not always be the case that if you\n",
- "walk the tree in pre-order, you will end up with the leaves (the words) \n",
+ "walk the tree in pre-order, you will end up with the leaves (the words)\n",
"in the same order as the original sentence.\n",
- "Restructuring generally improves that, because it often puts \n",
- "a node under a non-continuous parent object precisely at the location \n",
+ "Restructuring generally improves that, because it often puts\n",
+ "a node under a non-continuous parent object precisely at the location\n",
"that corresponds with a gap in the parent.\n",
"\n",
"However, there is no guarantee that every discontinuity will be resolved in this graceful manner.\n",
- "When we create the trees, we also output the list of slot numbers \n",
+ "When we create the trees, we also output the list of slot numbers\n",
"that you get when you walk the tree in pre-order.\n",
"Whenever this list is not monotonic, there is an issue with the ordering.\n",
"\n",
"### Note on cycles\n",
- "If a mother points to itself or a descendant of itself, we have a cycle in the mother relationship. \n",
- "In these cases, the restructuring algorithm will disconnect a parent link \n",
- "without introducing a new link to the tree above it: \n",
+ "If a mother points to itself or a descendant of itself, we have a cycle in the mother relationship.\n",
+ "In these cases, the restructuring algorithm will disconnect a parent link\n",
+ "without introducing a new link to the tree above it:\n",
"a whole fragment of the tree becomes disconnected and will get lost.\n",
"\n",
- "Sanity check 6 below reveals that this occurs in fact 4 times in the BHSA version 4 \n",
- "(it occurred 13 times in the BHSA 3 version). \n",
+ "Sanity check 6 below reveals that this occurs in fact 4 times in the BHSA version 4\n",
+ "(it occurred 13 times in the BHSA 3 version).\n",
"We will exclude these trees from further processing.\n",
"\n",
"### Note on stretch\n",
- "If a mother points outside the sentence of the clause \n",
+ "If a mother points outside the sentence of the clause\n",
"on which it is specified we have a case of stretch.\n",
- "This should not happen. Mothers may point outside their sentences, \n",
+ "This should not happen. Mothers may point outside their sentences,\n",
"but not in the cases that trigger restructuring.\n",
- "Yet, the sanity checks below reveal that this does occur in some versions. \n",
+ "Yet, the sanity checks below reveal that this does occur in some versions.\n",
"We will exclude these cases from further processing."
]
},
@@ -277,14 +277,12 @@
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
- "import sys\n",
"import os\n",
"import collections\n",
- "import random\n",
"\n",
"from tf.fabric import Fabric\n",
"\n",
- "from tree import Tree"
+ "from tree import Tree\n"
]
},
{
@@ -303,19 +301,19 @@
"metadata": {},
"outputs": [],
"source": [
- "VERSION = '2017'\n",
- "BHSA = 'BHSA/tf/{}'.format(VERSION)\n",
- "OUTPUTDIR = '_temp/{}'.format(VERSION)\n",
- "TFDIR = 'tf/{}'.format(VERSION)\n",
+ "VERSION = \"2017\"\n",
+ "BHSA = \"BHSA/tf/{}\".format(VERSION)\n",
+ "OUTPUTDIR = \"_temp/{}\".format(VERSION)\n",
+ "TFDIR = \"tf/{}\".format(VERSION)\n",
"\n",
"os.makedirs(OUTPUTDIR, exist_ok=True)\n",
"os.makedirs(TFDIR, exist_ok=True)\n",
"\n",
- "sp = 'part_of_speech' if VERSION == '3' else 'sp'\n",
- "rela = 'clause_constituent_relation' if VERSION == '3' else 'rela'\n",
- "ptyp = 'phrase_type' if VERSION == '3' else 'typ'\n",
- "ctyp = 'clause_atom_type' if VERSION == '3' else 'typ'\n",
- "g_word_utf8 = 'text' if VERSION == '3' else 'g_word_utf8'"
+ "sp = \"part_of_speech\" if VERSION == \"3\" else \"sp\"\n",
+ "rela = \"clause_constituent_relation\" if VERSION == \"3\" else \"rela\"\n",
+ "ptyp = \"phrase_type\" if VERSION == \"3\" else \"typ\"\n",
+ "ctyp = \"clause_atom_type\" if VERSION == \"3\" else \"typ\"\n",
+ "g_word_utf8 = \"text\" if VERSION == \"3\" else \"g_word_utf8\""
]
},
{
@@ -359,12 +357,14 @@
}
],
"source": [
- "TF = Fabric(locations='~/github/etcbc', modules=BHSA)\n",
- "api = TF.load(f'''\n",
+ "TF = Fabric(locations=\"~/github/etcbc\", modules=BHSA)\n",
+ "api = TF.load(\n",
+ " f\"\"\"\n",
" {sp} {rela} {ptyp} {ctyp}\n",
" {g_word_utf8}\n",
" mother\n",
- "''')\n",
+ "\"\"\"\n",
+ ")\n",
"api.makeAvailableIn(globals())"
]
},
@@ -372,7 +372,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We are going to make convenient labels for constituents, words and clauses, based on the \n",
+ "We are going to make convenient labels for constituents, words and clauses, based on\n",
"the types of textual objects and the features\n",
"`sp` and `rela`."
]
@@ -391,11 +391,11 @@
"outputs": [],
"source": [
"typeInfo = (\n",
- " (\"word\", ''),\n",
- " (\"subphrase\", 'U'),\n",
- " (\"phrase\", 'P'),\n",
- " (\"clause\", 'C'),\n",
- " (\"sentence\", 'S'),\n",
+ " (\"word\", \"\"),\n",
+ " (\"subphrase\", \"U\"),\n",
+ " (\"phrase\", \"P\"),\n",
+ " (\"clause\", \"C\"),\n",
+ " (\"sentence\", \"S\"),\n",
")\n",
"typeTable = dict(t for t in typeInfo)\n",
"typeOrder = [t[0] for t in typeInfo]"
@@ -448,30 +448,30 @@
"outputs": [],
"source": [
"posTable = {\n",
- " 'adjv': 'aj',\n",
- " 'adjective': 'aj',\n",
- " 'advb': 'av',\n",
- " 'adverb': 'av',\n",
- " 'art': 'dt',\n",
- " 'article': 'dt',\n",
- " 'conj': 'cj',\n",
- " 'conjunction': 'cj',\n",
- " 'inrg': 'ir',\n",
- " 'interrogative': 'ir',\n",
- " 'intj': 'ij',\n",
- " 'interjection': 'ij',\n",
- " 'nega': 'ng',\n",
- " 'negative': 'ng',\n",
- " 'nmpr': 'n-pr',\n",
- " 'pronoun': 'pr',\n",
- " 'prde': 'pr-dem',\n",
- " 'prep': 'pp',\n",
- " 'preposition': 'pp',\n",
- " 'prin': 'pr-int',\n",
- " 'prps': 'pr-ps',\n",
- " 'subs': 'n',\n",
- " 'noun': 'n',\n",
- " 'verb': 'vb',\n",
+ " \"adjv\": \"aj\",\n",
+ " \"adjective\": \"aj\",\n",
+ " \"advb\": \"av\",\n",
+ " \"adverb\": \"av\",\n",
+ " \"art\": \"dt\",\n",
+ " \"article\": \"dt\",\n",
+ " \"conj\": \"cj\",\n",
+ " \"conjunction\": \"cj\",\n",
+ " \"inrg\": \"ir\",\n",
+ " \"interrogative\": \"ir\",\n",
+ " \"intj\": \"ij\",\n",
+ " \"interjection\": \"ij\",\n",
+ " \"nega\": \"ng\",\n",
+ " \"negative\": \"ng\",\n",
+ " \"nmpr\": \"n-pr\",\n",
+ " \"pronoun\": \"pr\",\n",
+ " \"prde\": \"pr-dem\",\n",
+ " \"prep\": \"pp\",\n",
+ " \"preposition\": \"pp\",\n",
+ " \"prin\": \"pr-int\",\n",
+ " \"prps\": \"pr-ps\",\n",
+ " \"subs\": \"n\",\n",
+ " \"noun\": \"n\",\n",
+ " \"verb\": \"vb\",\n",
"}"
]
},
@@ -531,26 +531,26 @@
"outputs": [],
"source": [
"ccrInfo = {\n",
- " 'Adju': ('r', 'Cadju'),\n",
- " 'Appo': ('r', 'Cappo'),\n",
- " 'Attr': ('r', 'Cattr'),\n",
- " 'Cmpl': ('r', 'Ccmpl'),\n",
- " 'Coor': ('x', 'Ccoor'),\n",
- " 'CoVo': ('n', 'Ccovo'),\n",
- " 'Link': ('r', 'Clink'),\n",
- " 'Objc': ('r', 'Cobjc'),\n",
- " 'Para': ('r', 'Cpara'),\n",
- " 'PrAd': ('r', 'Cprad'),\n",
- " 'PreC': ('r', 'Cprec'),\n",
- " 'Pred': ('r', 'Cpred'),\n",
- " 'ReVo': ('n', 'Crevo'),\n",
- " 'Resu': ('n', 'Cresu'),\n",
- " 'RgRc': ('r', 'Crgrc'),\n",
- " 'Sfxs': ('r', 'Csfxs'),\n",
- " 'Spec': ('r', 'Cspec'),\n",
- " 'Subj': ('r', 'Csubj'),\n",
- " 'NA': ('n', 'C'),\n",
- " 'none': ('n', 'C'),\n",
+ " \"Adju\": (\"r\", \"Cadju\"),\n",
+ " \"Appo\": (\"r\", \"Cappo\"),\n",
+ " \"Attr\": (\"r\", \"Cattr\"),\n",
+ " \"Cmpl\": (\"r\", \"Ccmpl\"),\n",
+ " \"Coor\": (\"x\", \"Ccoor\"),\n",
+ " \"CoVo\": (\"n\", \"Ccovo\"),\n",
+ " \"Link\": (\"r\", \"Clink\"),\n",
+ " \"Objc\": (\"r\", \"Cobjc\"),\n",
+ " \"Para\": (\"r\", \"Cpara\"),\n",
+ " \"PrAd\": (\"r\", \"Cprad\"),\n",
+ " \"PreC\": (\"r\", \"Cprec\"),\n",
+ " \"Pred\": (\"r\", \"Cpred\"),\n",
+ " \"ReVo\": (\"n\", \"Crevo\"),\n",
+ " \"Resu\": (\"n\", \"Cresu\"),\n",
+ " \"RgRc\": (\"r\", \"Crgrc\"),\n",
+ " \"Sfxs\": (\"r\", \"Csfxs\"),\n",
+ " \"Spec\": (\"r\", \"Cspec\"),\n",
+ " \"Subj\": (\"r\", \"Csubj\"),\n",
+ " \"NA\": (\"n\", \"C\"),\n",
+ " \"none\": (\"n\", \"C\"),\n",
"}"
]
},
@@ -560,10 +560,15 @@
"metadata": {},
"outputs": [],
"source": [
- "treeTypes = ('sentence', 'clause', 'phrase', 'subphrase', 'word')\n",
- "(rootType, leafType, clauseType, phraseType) = (treeTypes[0], treeTypes[-1], treeTypes[1], treeTypes[2])\n",
- "ccrTable = dict((c[0],c[1][1]) for c in ccrInfo.items())\n",
- "ccrClass = dict((c[0],c[1][0]) for c in ccrInfo.items())"
+ "treeTypes = (\"sentence\", \"clause\", \"phrase\", \"subphrase\", \"word\")\n",
+ "(rootType, leafType, clauseType, phraseType) = (\n",
+ " treeTypes[0],\n",
+ " treeTypes[-1],\n",
+ " treeTypes[1],\n",
+ " treeTypes[2],\n",
+ ")\n",
+ "ccrTable = dict((c[0], c[1][1]) for c in ccrInfo.items())\n",
+ "ccrClass = dict((c[0], c[1][0]) for c in ccrInfo.items())"
]
},
{
@@ -573,7 +578,7 @@
"Now we can actually construct the tree by initializing a tree object.\n",
"After that we call its ``restructureClauses()`` method.\n",
"\n",
- "Then we have two tree structures for each sentence: \n",
+ "Then we have two tree structures for each sentence:\n",
"\n",
"* the *etree*, i.e. the tree obtained by working out the embedding relationships and nothing else\n",
"* the *rtree*, i.e. the tree obtained by restructuring the *etree*\n",
@@ -616,13 +621,15 @@
}
],
"source": [
- "tree = Tree(TF, otypes=treeTypes, \n",
+ "tree = Tree(\n",
+ " TF,\n",
+ " otypes=treeTypes,\n",
" phraseType=phraseType,\n",
" clauseType=clauseType,\n",
" ccrFeature=rela,\n",
" ptFeature=ptyp,\n",
" posFeature=sp,\n",
- " motherFeature='mother',\n",
+ " motherFeature=\"mother\",\n",
")"
]
},
@@ -653,10 +660,10 @@
"source": [
"tree.restructureClauses(ccrClass)\n",
"results = tree.relations()\n",
- "parent = results['rparent']\n",
- "sisters = results['sisters']\n",
- "children = results['rchildren']\n",
- "elderSister = results['elderSister']\n",
+ "parent = results[\"rparent\"]\n",
+ "sisters = results[\"sisters\"]\n",
+ "children = results[\"rchildren\"]\n",
+ "elderSister = results[\"elderSister\"]\n",
"TF.info(\"Ready for processing\")"
]
},
@@ -672,34 +679,34 @@
"metadata": {},
"source": [
"Let us see whether the trees we have constructed satisfy some sanity constraints.\n",
- "After all, the algorithm is based on certain assumptions about the data, \n",
+ "After all, the algorithm is based on certain assumptions about the data,\n",
"but are those assumptions valid?\n",
"And restructuring is a tricky operation, do we have confidence that nothing went wrong?\n",
"\n",
"1. How many sentence nodes? From earlier queries we know what to expect.\n",
- "1. Does any sentence have a parent? \n",
+ "1. Does any sentence have a parent?\n",
" If so, there is something wrong with our assumptions or algorithm.\n",
- "1. Is every top node a sentence? \n",
+ "1. Is every top node a sentence?\n",
" If not, we have material outside a sentence, which contradicts the assumptions.\n",
- "1. Do you reach all sentences if you go up from words? \n",
+ "1. Do you reach all sentences if you go up from words?\n",
" If not, some sentences do not contain words.\n",
- "1. Do you reach all words if you go down from sentences? \n",
+ "1. Do you reach all words if you go down from sentences?\n",
" If not, some words have become disconnected from their sentences.\n",
- "1. Do you reach the same words in reconstructed trees as in embedded trees? \n",
+ "1. Do you reach the same words in reconstructed trees as in embedded trees?\n",
" If not, some sentence material has got lost during the restructuring process.\n",
- "1. From what object types to what object types does the parent relationship link? \n",
- " Here we check that parents do not link object types \n",
+ "1. From what object types to what object types does the parent relationship link?\n",
+ " Here we check that parents do not link object types\n",
" that are too distant in the object type ranking.\n",
- "1. How many nodes have mothers and how many mothers can a node have? \n",
+ "1. How many nodes have mothers and how many mothers can a node have?\n",
" We expect at most one.\n",
"1. From what object types to what object types does the mother relationship link?\n",
- "1. Is the mother of a clause always in the same sentence? \n",
- " If not, foreign sentences will be drawn in, leading to (very) big chunks. \n",
- " This may occur when we use mother relationships in cases where \n",
+ "1. Is the mother of a clause always in the same sentence?\n",
+ " If not, foreign sentences will be drawn in, leading to (very) big chunks.\n",
+ " This may occur when we use mother relationships in cases where\n",
" `rela` has different values than the ones that should trigger restructuring.\n",
- "1. Has the max/average tree depth increased after restructuring? \n",
- " By how much? This is meant as an indication by how much \n",
- " our tree structures improve in significant hierarchy \n",
+ "1. Has the max/average tree depth increased after restructuring?\n",
+ " By how much? This is meant as an indication of how much\n",
+ " our tree structures improve in significant hierarchy\n",
" when we take the mother relationship into account."
]
},
@@ -735,15 +742,19 @@
}
],
"source": [
- "#1\n",
+ "# 1\n",
"expectedSentences = {\n",
- " '3': 71354,\n",
- " '4': 66045,\n",
- " '4b': 63586,\n",
- " '2016': 63570,\n",
- " '2017': 63711,\n",
+ " \"3\": 71354,\n",
+ " \"4\": 66045,\n",
+ " \"4b\": 63586,\n",
+ " \"2016\": 63570,\n",
+ " \"2017\": 63711,\n",
"}\n",
- "TF.info(\"Counting {}s ... (expecting {})\".format(rootType, expectedSentences.get(VERSION, '??')))\n",
+ "TF.info(\n",
+ " \"Counting {}s ... (expecting {})\".format(\n",
+ " rootType, expectedSentences.get(VERSION, \"??\")\n",
+ " )\n",
+ ")\n",
"TF.info(\"There are {} {}s\".format(len(list(F.otype.s(rootType))), rootType))"
]
},
@@ -762,21 +773,23 @@
}
],
"source": [
- "#2\n",
+ "# 2\n",
"TF.info(\"Checking parents of {}s ... (expecting none)\".format(rootType))\n",
"exceptions = set()\n",
"for node in F.otype.s(rootType):\n",
- " if node in parent: exceptions.add(node)\n",
+ " if node in parent:\n",
+ " exceptions.add(node)\n",
"if len(exceptions) == 0:\n",
" TF.info(\"No {} has a parent\".format(rootType))\n",
"else:\n",
" TF.error(\"{} {}s have a parent:\".format(len(exceptions), rootType))\n",
" for n in sorted(exceptions):\n",
" p = parent[n]\n",
- " msg(\"{} {} [{}] has {} parent {} [{}]\".format(\n",
- " rootType, n, tree.slotss(n), \n",
- " F.otype.v(p), p, tree.slotss(p)\n",
- " ))"
+ " TF.error(\n",
+ " \"{} {} [{}] has {} parent {} [{}]\".format(\n",
+ " rootType, n, tree.slotss(n), F.otype.v(p), p, tree.slotss(p)\n",
+ " )\n",
+ " )"
]
},
{
@@ -796,23 +809,29 @@
}
],
"source": [
- "#3 (again a check on #1)\n",
- "TF.info('Checking the types of root nodes ... (should all be {}s)'.format(rootType))\n",
+ "# 3 (again a check on #1)\n",
+ "TF.info(\"Checking the types of root nodes ... (should all be {}s)\".format(rootType))\n",
"expectedTops = {\n",
- " '3': 0,\n",
- " '4': '3 subphrases',\n",
- " '4b': 0,\n",
- " '2016': 0,\n",
- " '2017':0,\n",
+ " \"3\": 0,\n",
+ " \"4\": \"3 subphrases\",\n",
+ " \"4b\": 0,\n",
+ " \"2016\": 0,\n",
+ " \"2017\": 0,\n",
"}\n",
- "TF.info('Expected roots which are non-{}s: {}'.format(rootType, expectedTops.get(VERSION, '??')))\n",
+ "TF.info(\n",
+ " \"Expected roots which are non-{}s: {}\".format(\n",
+ " rootType, expectedTops.get(VERSION, \"??\")\n",
+ " )\n",
+ ")\n",
"exceptions = collections.defaultdict(lambda: [])\n",
"sn = 0\n",
"for node in N.walk():\n",
" otype = F.otype.v(node)\n",
- " if otype not in typeTable: continue\n",
- " if otype == rootType: sn += 1\n",
- " if node not in parent and node not in elderSister and otype != rootType: \n",
+ " if otype not in typeTable:\n",
+ " continue\n",
+ " if otype == rootType:\n",
+ " sn += 1\n",
+ " if node not in parent and node not in elderSister and otype != rootType:\n",
" exceptions[otype].append(node)\n",
"TF.info(\"{} {}s seen\".format(sn, rootType))\n",
"\n",
@@ -824,9 +843,11 @@
" TF.error(\"{}: {}x\".format(t, len(exceptions[t])), tm=False)\n",
"\n",
"for c in exceptions[clauseType]:\n",
- " (s, st) = tree.getRoot(c, 'e')\n",
+ " (s, st) = tree.getRoot(c, \"e\")\n",
" v = rootVerse[s]\n",
- " TF.error(\"{}={}, {}={}={}, verse={}\".format(clauseType, c, rootType, st, s, v), tm=False)"
+ " TF.error(\n",
+ " \"{}={}, {}={}={}, verse={}\".format(clauseType, c, rootType, st, s, v), tm=False\n",
+ " )"
]
},
{
@@ -853,46 +874,56 @@
}
],
"source": [
- "#4, 5\n",
+ "# 4, 5\n",
"def getTop(kind, rel, rela, multi):\n",
" seen = set()\n",
" topNodes = set()\n",
" startNodes = set(F.otype.s(kind))\n",
" nextNodes = startNodes\n",
- " TF.info('Starting from {} nodes ...'.format(kind))\n",
+ " TF.info(\"Starting from {} nodes ...\".format(kind))\n",
" while len(nextNodes):\n",
" newNextNodes = set()\n",
" for node in nextNodes:\n",
- " if node in seen: continue\n",
+ " if node in seen:\n",
+ " continue\n",
" seen.add(node)\n",
" isTop = True\n",
- " if node in rel: \n",
+ " if node in rel:\n",
" isTop = False\n",
" if multi:\n",
- " for c in rel[node]: newNextNodes.add(c)\n",
+ " for c in rel[node]:\n",
+ " newNextNodes.add(c)\n",
" else:\n",
" newNextNodes.add(rel[node])\n",
- " if node in rela: \n",
+ " if node in rela:\n",
" isTop = False\n",
" if multi:\n",
- " for c in rela[node]: newNextNodes.add(c)\n",
+ " for c in rela[node]:\n",
+ " newNextNodes.add(c)\n",
" else:\n",
" newNextNodes.add(rela[node])\n",
- " if isTop: topNodes.add(node)\n",
+ " if isTop:\n",
+ " topNodes.add(node)\n",
" nextNodes = newNextNodes\n",
" topTypes = collections.defaultdict(lambda: 0)\n",
" for t in topNodes:\n",
" topTypes[F.otype.v(t)] += 1\n",
" for t in topTypes:\n",
- " TF.info('From {} {} nodes reached {} {} nodes'.format(len(startNodes), kind, topTypes[t], t), tm=False)\n",
+ " TF.info(\n",
+ " \"From {} {} nodes reached {} {} nodes\".format(\n",
+ " len(startNodes), kind, topTypes[t], t\n",
+ " ),\n",
+ " tm=False,\n",
+ " )\n",
+ "\n",
"\n",
- "TF.info('Embedding trees')\n",
+ "TF.info(\"Embedding trees\")\n",
"getTop(leafType, tree.eparent, {}, False)\n",
"getTop(rootType, tree.echildren, {}, True)\n",
- "TF.info('Restructd trees')\n",
+ "TF.info(\"Restructd trees\")\n",
"getTop(leafType, tree.rparent, tree.elderSister, False)\n",
"getTop(rootType, tree.rchildren, tree.sisters, True)\n",
- "TF.info('Done')"
+ "TF.info(\"Done\")"
]
},
{
@@ -911,40 +942,48 @@
}
],
"source": [
- "#6\n",
- "TF.info('Verifying whether all slots are preserved under restructuring')\n",
+ "# 6\n",
+ "TF.info(\"Verifying whether all slots are preserved under restructuring\")\n",
"expectedMismatches = {\n",
- " '3': 13,\n",
- " '4': 3,\n",
- " '4b': 0,\n",
- " '2016': 0,\n",
- " '2017': 0,\n",
+ " \"3\": 13,\n",
+ " \"4\": 3,\n",
+ " \"4b\": 0,\n",
+ " \"2016\": 0,\n",
+ " \"2017\": 0,\n",
"}\n",
- "TF.info('Expected mismatches: {}'.format(expectedMismatches.get(VERSION, '??')))\n",
+ "TF.info(\"Expected mismatches: {}\".format(expectedMismatches.get(VERSION, \"??\")))\n",
"\n",
"errors = []\n",
- "#i = 10\n",
+ "# i = 10\n",
"for snode in F.otype.s(rootType):\n",
" declaredSlots = set(E.oslots.s(snode))\n",
" results = {}\n",
" thisgood = {}\n",
- " for kind in ('e', 'r'):\n",
- " results[kind] = set(l for l in tree.getLeaves(snode, kind) if F.otype.v(l) == leafType)\n",
+ " for kind in (\"e\", \"r\"):\n",
+ " results[kind] = set(\n",
+ " lf for lf in tree.getLeaves(snode, kind) if F.otype.v(lf) == leafType\n",
+ " )\n",
" thisgood[kind] = declaredSlots == results[kind]\n",
- " #if not thisgood[kind]:\n",
+ " # if not thisgood[kind]:\n",
" # print('{} D={}\\n L={}'.format(kind, declaredSlots, results[kind]))\n",
" # i -= 1\n",
- " #if i == 0: break\n",
- " if False in thisgood.values(): errors.append((snode, thisgood['e'], thisgood['r']))\n",
+ " # if i == 0: break\n",
+ " if False in thisgood.values():\n",
+ " errors.append((snode, thisgood[\"e\"], thisgood[\"r\"]))\n",
"nErrors = len(errors)\n",
"if nErrors:\n",
- " TF.error('{} mismatches:'.format(len(errors)))\n",
+ " TF.error(\"{} mismatches:\".format(len(errors)))\n",
" mine = min(20, len(errors))\n",
" skip |= {e[0] for e in errors}\n",
" for (s, e, r) in errors[0:mine]:\n",
- " TF.error('{} embedding: {}; restructd: {}'.format(s, 'OK' if e else 'XX', 'OK' if r else 'XX'), tm=False)\n",
+ " TF.error(\n",
+ " \"{} embedding: {}; restructd: {}\".format(\n",
+ " s, \"OK\" if e else \"XX\", \"OK\" if r else \"XX\"\n",
+ " ),\n",
+ " tm=False,\n",
+ " )\n",
"else:\n",
- " TF.info('{} mismatches'.format(len(errors)))"
+ " TF.info(\"{} mismatches\".format(len(errors)))"
]
},
{
@@ -978,17 +1017,17 @@
}
],
"source": [
- "#7\n",
- "TF.info('Which types embed which types and how often? ...')\n",
- "for kind in ('e', 'r'):\n",
+ "# 7\n",
+ "TF.info(\"Which types embed which types and how often? ...\")\n",
+ "for kind in (\"e\", \"r\"):\n",
" pLinkedTypes = collections.defaultdict(lambda: 0)\n",
- " parent = tree.eparent if kind == 'e' else tree.rparent\n",
- " kindRep = 'embedding' if kind == 'e' else 'restructd'\n",
+ " parent = tree.eparent if kind == \"e\" else tree.rparent\n",
+ " kindRep = \"embedding\" if kind == \"e\" else \"restructd\"\n",
" for (c, p) in parent.items():\n",
" pLinkedTypes[(F.otype.v(c), F.otype.v(p))] += 1\n",
" TF.info(\"Found {} parent ({}) links between types\".format(len(parent), kindRep))\n",
" for lt in sorted(pLinkedTypes):\n",
- " TF.info('{}: {}x'.format(lt, pLinkedTypes[lt]), tm=False)"
+ " TF.info(\"{}: {}x\".format(lt, pLinkedTypes[lt]), tm=False)"
]
},
{
@@ -1007,18 +1046,23 @@
}
],
"source": [
- "#8\n",
- "TF.info('How many mothers can nodes have? ...')\n",
+ "# 8\n",
+ "TF.info(\"How many mothers can nodes have? ...\")\n",
"motherLen = {}\n",
"for c in N.walk():\n",
" lms = list(E.mother.f(c))\n",
" nms = len(lms)\n",
- " if nms: motherLen[c] = nms\n",
+ " if nms:\n",
+ " motherLen[c] = nms\n",
"count = collections.defaultdict(lambda: 0)\n",
- "for c in tree.mother: count[motherLen[c]] += 1\n",
- "TF.info('There are {} tree nodes with a mother'.format(len(tree.mother)))\n",
+ "for c in tree.mother:\n",
+ " count[motherLen[c]] += 1\n",
+ "TF.info(\"There are {} tree nodes with a mother\".format(len(tree.mother)))\n",
"for cnt in sorted(count):\n",
- " TF.info('{} nodes have {} mother{}'.format(count[cnt], cnt, 's' if cnt != 1 else ''), tm=False) "
+ " TF.info(\n",
+ " \"{} nodes have {} mother{}\".format(count[cnt], cnt, \"s\" if cnt != 1 else \"\"),\n",
+ " tm=False,\n",
+ " )"
]
},
{
@@ -1055,15 +1099,15 @@
}
],
"source": [
- "#9\n",
- "TF.info('Which types have mother links to which types and how often? ...')\n",
+ "# 9\n",
+ "TF.info(\"Which types have mother links to which types and how often? ...\")\n",
"mLinkedTypes = collections.defaultdict(lambda: set())\n",
"for (c, m) in tree.mother.items():\n",
" ctype = F.otype.v(c)\n",
" mLinkedTypes[(ctype, Fs(rela).v(c), F.otype.v(m))].add(c)\n",
- "TF.info('Found {} mother links between types'.format(len(parent)))\n",
+ "TF.info(\"Found {} mother links between types\".format(len(parent)))\n",
"for lt in sorted(mLinkedTypes):\n",
- " TF.info('{}: {}x'.format(lt, len(mLinkedTypes[lt])), tm=False)"
+ " TF.info(\"{}: {}x\".format(lt, len(mLinkedTypes[lt])), tm=False)"
]
},
{
@@ -1082,35 +1126,45 @@
}
],
"source": [
- "#10\n",
- "TF.info('Counting {}s with mothers in another {}'.format(clauseType, rootType))\n",
+ "# 10\n",
+ "TF.info(\"Counting {}s with mothers in another {}\".format(clauseType, rootType))\n",
"expectedOther = {\n",
- " '3': 2,\n",
- " '4': 0,\n",
- " '4b': 0,\n",
- " '2016': 0,\n",
- " '2017': 0,\n",
+ " \"3\": 2,\n",
+ " \"4\": 0,\n",
+ " \"4b\": 0,\n",
+ " \"2016\": 0,\n",
+ " \"2017\": 0,\n",
"}\n",
- "TF.info('Expecting {} {}s with mothers in another {}'.format(expectedOther.get(VERSION, '??'), clauseType, rootType))\n",
+ "TF.info(\n",
+ " \"Expecting {} {}s with mothers in another {}\".format(\n",
+ " expectedOther.get(VERSION, \"??\"), clauseType, rootType\n",
+ " )\n",
+ ")\n",
"exceptions = set()\n",
"for node in tree.mother:\n",
- " if F.otype.v(node) not in typeTable: continue\n",
+ " if F.otype.v(node) not in typeTable:\n",
+ " continue\n",
" mNode = tree.mother[node]\n",
- " sNode = tree.getRoot(node, 'e')\n",
- " smNode = tree.getRoot(mNode, 'e')\n",
+ " sNode = tree.getRoot(node, \"e\")\n",
+ " smNode = tree.getRoot(mNode, \"e\")\n",
" if sNode != smNode:\n",
- " exceptions.add((node, sNode, smNode))\n",
- "TF.info('{} nodes have a mother in another {}'.format(len(exceptions), rootType))\n",
+ " exceptions.add((node, sNode, smNode))\n",
+ "TF.info(\"{} nodes have a mother in another {}\".format(len(exceptions), rootType))\n",
"for (n, sn, smn) in exceptions:\n",
- " TF.error('[{} {}]({}) occurs in {} but has mother in {}'.format(\n",
- " F.otype.v(n), tree.slotss(n), n, sn, smn), tm=False,\n",
+ " TF.error(\n",
+ " \"[{} {}]({}) occurs in {} but has mother in {}\".format(\n",
+ " F.otype.v(n), tree.slotss(n), n, sn, smn\n",
+ " ),\n",
+ " tm=False,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 23,
- "metadata": {},
+ "metadata": {
+ "lines_to_end_of_cell_marker": 2
+ },
"outputs": [
{
"name": "stdout",
@@ -1128,52 +1182,73 @@
}
],
"source": [
- "#11\n",
- "TF.info('Computing lengths and depths')\n",
+ "# 11\n",
+ "TF.info(\"Computing lengths and depths\")\n",
"nTrees = 0\n",
"rnTrees = 0\n",
- "totalDepth = {'e': 0, 'r': 0}\n",
- "rTotalDepth = {'e': 0, 'r': 0}\n",
- "maxDepth = {'e': 0, 'r':0}\n",
- "rMaxDepth = {'e': 0, 'r': 0}\n",
+ "totalDepth = {\"e\": 0, \"r\": 0}\n",
+ "rTotalDepth = {\"e\": 0, \"r\": 0}\n",
+ "maxDepth = {\"e\": 0, \"r\": 0}\n",
+ "rMaxDepth = {\"e\": 0, \"r\": 0}\n",
"totalLength = 0\n",
"\n",
"for node in F.otype.s(rootType):\n",
" nTrees += 1\n",
" totalLength += tree.length(node)\n",
" thisDepth = {}\n",
- " for kind in ('e', 'r'):\n",
+ " for kind in (\"e\", \"r\"):\n",
" thisDepth[kind] = tree.depth(node, kind)\n",
- " different = thisDepth['e'] != thisDepth['r']\n",
- " if different: rnTrees += 1\n",
- " for kind in ('e', 'r'):\n",
- " if thisDepth[kind] > maxDepth[kind]: maxDepth[kind] = thisDepth[kind]\n",
+ " different = thisDepth[\"e\"] != thisDepth[\"r\"]\n",
+ " if different:\n",
+ " rnTrees += 1\n",
+ " for kind in (\"e\", \"r\"):\n",
+ " if thisDepth[kind] > maxDepth[kind]:\n",
+ " maxDepth[kind] = thisDepth[kind]\n",
" totalDepth[kind] += thisDepth[kind]\n",
" if different:\n",
- " if thisDepth[kind] > rMaxDepth[kind]: rMaxDepth[kind] = thisDepth[kind]\n",
+ " if thisDepth[kind] > rMaxDepth[kind]:\n",
+ " rMaxDepth[kind] = thisDepth[kind]\n",
" rTotalDepth[kind] += thisDepth[kind]\n",
- " \n",
- "TF.info('{} trees seen, of which in {} cases restructuring makes a difference in depth'.format(\n",
- " nTrees, rnTrees,\n",
- "))\n",
+ "\n",
+ "TF.info(\n",
+ " \"{} trees seen, of which in {} cases restructuring makes a difference in depth\".format(\n",
+ " nTrees,\n",
+ " rnTrees,\n",
+ " )\n",
+ ")\n",
"if nTrees > 0:\n",
- " TF.info('Embedding trees: max depth = {:>2}, average depth = {:.2g}'.format(\n",
- " maxDepth['e'], totalDepth['e'] / nTrees,\n",
- " ))\n",
- " TF.info('Restructd trees: max depth = {:>2}, average depth = {:.2g}'.format(\n",
- " maxDepth['r'], totalDepth['r'] / nTrees,\n",
- " ))\n",
+ " TF.info(\n",
+ " \"Embedding trees: max depth = {:>2}, average depth = {:.2g}\".format(\n",
+ " maxDepth[\"e\"],\n",
+ " totalDepth[\"e\"] / nTrees,\n",
+ " )\n",
+ " )\n",
+ " TF.info(\n",
+ " \"Restructd trees: max depth = {:>2}, average depth = {:.2g}\".format(\n",
+ " maxDepth[\"r\"],\n",
+ " totalDepth[\"r\"] / nTrees,\n",
+ " )\n",
+ " )\n",
"if rnTrees > 0:\n",
- " TF.info('Statistics for cases where restructuring makes a difference:')\n",
- " TF.info('Embedding trees: max depth = {:>2}, average depth = {:.2g}'.format(\n",
- " rMaxDepth['e'], rTotalDepth['e'] / rnTrees,\n",
- " ))\n",
- " TF.info('Restructd trees: max depth = {:>2}, average depth = {:.2g}'.format(\n",
- " rMaxDepth['r'], rTotalDepth['r'] / rnTrees,\n",
- " ))\n",
- "TF.info('Total number of leaves in the trees: {}, average number of leaves = {:.2g}'.format(\n",
- " totalLength, totalLength / nTrees,\n",
- "))"
+ " TF.info(\"Statistics for cases where restructuring makes a difference:\")\n",
+ " TF.info(\n",
+ " \"Embedding trees: max depth = {:>2}, average depth = {:.2g}\".format(\n",
+ " rMaxDepth[\"e\"],\n",
+ " rTotalDepth[\"e\"] / rnTrees,\n",
+ " )\n",
+ " )\n",
+ " TF.info(\n",
+ " \"Restructd trees: max depth = {:>2}, average depth = {:.2g}\".format(\n",
+ " rMaxDepth[\"r\"],\n",
+ " rTotalDepth[\"r\"] / rnTrees,\n",
+ " )\n",
+ " )\n",
+ "TF.info(\n",
+ " \"Total number of leaves in the trees: {}, average number of leaves = {:.2g}\".format(\n",
+ " totalLength,\n",
+ " totalLength / nTrees,\n",
+ " )\n",
+ ")"
]
},
{
@@ -1194,34 +1269,40 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
"source": [
"## getTag(node)\n",
"\n",
"This function produces for each node\n",
"\n",
- "* a tag string, \n",
- "* a part-of-speech representation, \n",
+ "* a tag string,\n",
+ "* a part-of-speech representation,\n",
"* a textual position (slot number),\n",
"* a boolean which tells if this node is a leaf or not.\n",
"\n",
"This function will be passed to the `writeTree()` function in the `tree` module.\n",
- "By supplying a different function, you can control a lot of the characteristics of the \n",
+ "By supplying a different function, you can control a lot of the characteristics of the\n",
"written tree."
]
},
{
"cell_type": "code",
"execution_count": 25,
- "metadata": {},
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
"outputs": [],
"source": [
"def getTag(node):\n",
" otype = F.otype.v(node)\n",
" tag = typeTable[otype]\n",
- " if tag == 'P': tag = Fs(ptyp).v(node)\n",
- " elif tag == 'C': tag = ccrTable[Fs(rela).v(node)]\n",
- " isWord = tag == ''\n",
+ " if tag == \"P\":\n",
+ " tag = Fs(ptyp).v(node)\n",
+ " elif tag == \"C\":\n",
+ " tag = ccrTable[Fs(rela).v(node)]\n",
+ " isWord = tag == \"\"\n",
" pos = posTable[Fs(sp).v(node)] if isWord else None\n",
" slot = node if isWord else None\n",
" text = '\"{}\"'.format(Fs(g_word_utf8).v(node)) if isWord else None\n",
@@ -1230,7 +1311,9 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
"source": [
"This is a variant on `getTag()` where we put the node number into the tag, between `{ }`."
]
@@ -1244,11 +1327,13 @@
"def getTagN(node):\n",
" otype = F.otype.v(node)\n",
" tag = typeTable[otype]\n",
- " if tag == 'P': tag = Fs(ptyp).v(node)\n",
- " elif tag == 'C': tag = ccrTable[Fs(rela).v(node)]\n",
- " isWord = tag == ''\n",
+ " if tag == \"P\":\n",
+ " tag = Fs(ptyp).v(node)\n",
+ " elif tag == \"C\":\n",
+ " tag = ccrTable[Fs(rela).v(node)]\n",
+ " isWord = tag == \"\"\n",
" if not isWord:\n",
- " tag += '{' + str(node) + '}'\n",
+ " tag += \"{\" + str(node) + \"}\"\n",
" pos = posTable[Fs(sp).v(node)] if isWord else None\n",
" slot = node if isWord else None\n",
" text = '\"{}\"'.format(Fs(g_word_utf8).v(node)) if isWord else None\n",
@@ -1285,16 +1370,21 @@
}
],
"source": [
- "TF.info('Exporting {} trees to TF'.format(rootType))\n",
+ "TF.info(\"Exporting {} trees to TF\".format(rootType))\n",
"s = 0\n",
"chunk = 10000\n",
"sc = 0\n",
"treeData = {}\n",
"treeDataN = {}\n",
"for node in F.otype.s(rootType):\n",
- " if node in skip: continue\n",
- " (treeRep, wordsRep, bSlot) = tree.writeTree(node, 'r', getTag, rev=False, leafNumbers=True)\n",
- " (treeNRep, wordsNRep, bSlotN) = tree.writeTree(node, 'r', getTagN, rev=False, leafNumbers=True)\n",
+ " if node in skip:\n",
+ " continue\n",
+ " (treeRep, wordsRep, bSlot) = tree.writeTree(\n",
+ " node, \"r\", getTag, rev=False, leafNumbers=True\n",
+ " )\n",
+ " (treeNRep, wordsNRep, bSlotN) = tree.writeTree(\n",
+ " node, \"r\", getTagN, rev=False, leafNumbers=True\n",
+ " )\n",
" treeData[node] = treeRep\n",
" treeDataN[node] = treeNRep\n",
" s += 1\n",
@@ -1302,7 +1392,7 @@
" if sc == chunk:\n",
" TF.info(\"{} trees composed\".format(s))\n",
" sc = 0\n",
- "TF.info('{} trees composed'.format(s))"
+ "TF.info(\"{} trees composed\".format(s))"
]
},
{
@@ -1332,25 +1422,25 @@
"nodeFeatures = dict(tree=treeData, treen=treeDataN)\n",
"metaData = dict(\n",
" tree=dict(\n",
- " valueType='str',\n",
- " description='penn treebank represententation for sentences',\n",
- " converter='Dirk Roorda', \n",
- " convertor='trees.ipynb',\n",
- " url='https://github.com/etcbc/trees/trees.ipynb',\n",
- " coreData='BHSA',\n",
+ " valueType=\"str\",\n",
+ " description=\"penn treebank representation for sentences\",\n",
+ " converter=\"Dirk Roorda\",\n",
+ " convertor=\"trees.ipynb\",\n",
+ " url=\"https://github.com/etcbc/trees/trees.ipynb\",\n",
+ " coreData=\"BHSA\",\n",
" coreVersion=VERSION,\n",
" ),\n",
" treen=dict(\n",
- " valueType='str',\n",
- " description='penn treebank represententation for sentences with node numbers included',\n",
- " converter='Dirk Roorda', \n",
- " convertor='trees.ipynb',\n",
- " url='https://github.com/etcbc/trees/trees.ipynb',\n",
- " coreData='BHSA',\n",
+ " valueType=\"str\",\n",
+ " description=\"penn treebank representation for sentences with node numbers included\",\n",
+ " converter=\"Dirk Roorda\",\n",
+ " convertor=\"trees.ipynb\",\n",
+ " url=\"https://github.com/etcbc/trees/trees.ipynb\",\n",
+ " coreData=\"BHSA\",\n",
" coreVersion=VERSION,\n",
- " )\n",
+ " ),\n",
")\n",
- "TF.info('Writing tree feature to TF')\n",
+ "TF.info(\"Writing tree feature to TF\")\n",
"TFw = Fabric(locations=TFDIR, silent=True)\n",
"TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)"
]
@@ -1378,7 +1468,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "See the tutorial \n",
+ "See the tutorial\n",
"[trees](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/bhsa/trees.ipynb)\n",
"for how to make use of this feature."
]