From cc5cd94cee7e953e3d6433d09f56ffbefc9345a1 Mon Sep 17 00:00:00 2001 From: Dirk Roorda Date: Thu, 1 Jul 2021 18:18:02 +0200 Subject: [PATCH] linted and blackened notebooks using jupytext --- .flake8 | 4 + programs/remains.ipynb | 175 ++++++----- programs/trees.ipynb | 664 +++++++++++++++++++++++------------------ 3 files changed, 477 insertions(+), 366 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..4468690 --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +select = C,E,F,W,B,B950 +ignore = E203, E501, W503 +builtins = C,E,Eall,Es,F,Fall,Fs,L,N,S,T,TF diff --git a/programs/remains.ipynb b/programs/remains.ipynb index dd64aa3..8cc6907 100644 --- a/programs/remains.ipynb +++ b/programs/remains.ipynb @@ -41,29 +41,34 @@ } ], "source": [ - "TF.info('Writing {} trees'.format(rootType))\n", - "treeFile = '{}/trees-BHSA.txt'.format(OUTPUTDIR)\n", - "with open(treeFile, 'w') as trees:\n", - " verseLabel = ''\n", + "TF.info(\"Writing {} trees\".format(rootType))\n", + "treeFile = \"{}/trees-BHSA.txt\".format(OUTPUTDIR)\n", + "with open(treeFile, \"w\") as trees:\n", + " verseLabel = \"\"\n", " s = 0\n", " chunk = 10000\n", " sc = 0\n", " for node in F.otype.s(rootType):\n", - " if node in skip: continue\n", - " (treeRep, wordsRep, bSlot) = tree.writeTree(node, 'r', getTag, rev=False, leafNumbers=False)\n", - " trees.write('\\n#{}\\tnode={}\\tbSlot={}\\t{}\\n{}\\n'.format(\n", - " '{} {}:{}'.format(*T.sectionFromNode(node)), \n", - " node,\n", - " bSlot, \n", - " wordsRep,\n", - " treeRep,\n", - " ))\n", + " if node in skip:\n", + " continue\n", + " (treeRep, wordsRep, bSlot) = tree.writeTree(\n", + " node, \"r\", getTag, rev=False, leafNumbers=False\n", + " )\n", + " trees.write(\n", + " \"\\n#{}\\tnode={}\\tbSlot={}\\t{}\\n{}\\n\".format(\n", + " \"{} {}:{}\".format(*T.sectionFromNode(node)),\n", + " node,\n", + " bSlot,\n", + " wordsRep,\n", + " treeRep,\n", + " )\n", + " )\n", " s += 1\n", " sc += 1\n", " if sc == chunk:\n", " TF.info(\"{} trees written\".format(s))\n", " sc = 0\n", - "TF.info('{} trees written to {}'.format(s, treeFile))" + "TF.info(\"{} trees written to {}\".format(s, treeFile))" ] }, { @@ -130,29 +135,34 @@ } ], "source": [ - "TF.info('Writing {} trees'.format(rootType))\n", - "treeFile = '{}/trees-BHSA-nodes.txt'.format(OUTPUTDIR)\n", - "with open(treeFile, 'w') as trees:\n", - " verseLabel = ''\n", + "TF.info(\"Writing {} trees\".format(rootType))\n", + "treeFile = \"{}/trees-BHSA-nodes.txt\".format(OUTPUTDIR)\n", + "with open(treeFile, \"w\") as trees:\n", + " verseLabel = \"\"\n", " s = 0\n", " chunk = 10000\n", " sc = 0\n", " for node in F.otype.s(rootType):\n", - " if node in skip: continue\n", - " (treeRep, wordsRep, bSlot) = tree.writeTree(node, 'r', getTagN, rev=False, leafNumbers=False)\n", - " trees.write('\\n#{}\\tnode={}\\tbSlot={}\\t{}\\n{}\\n'.format(\n", - " '{} {}:{}'.format(*T.sectionFromNode(node)), \n", - " node,\n", - " bSlot, \n", - " wordsRep,\n", - " treeRep,\n", - " ))\n", + " if node in skip:\n", + " continue\n", + " (treeRep, wordsRep, bSlot) = tree.writeTree(\n", + " node, \"r\", getTagN, rev=False, leafNumbers=False\n", + " )\n", + " trees.write(\n", + " \"\\n#{}\\tnode={}\\tbSlot={}\\t{}\\n{}\\n\".format(\n", + " \"{} {}:{}\".format(*T.sectionFromNode(node)),\n", + " node,\n", + " bSlot,\n", + " wordsRep,\n", + " treeRep,\n", + " )\n", + " )\n", " s += 1\n", " sc += 1\n", " if sc == chunk:\n", " TF.info(\"{} trees written\".format(s))\n", " sc = 0\n", - "TF.info('{} trees written to {}'.format(s, treeFile))" + "TF.info(\"{} trees written to {}\".format(s, treeFile))" ] }, { @@ -243,23 +253,32 @@ " vNode = T.nodeFromSection(passage)\n", " return L.d(vNode, otype=rootType)\n", "\n", + "\n", "def showcases(cases, oFile):\n", - " with open(oFile, 'w') as out:\n", + " with open(oFile, \"w\") as out:\n", " for (sNode, caseText) in cases.items():\n", - " out.write('\\n====================\\n{}\\n{}\\n{} TF-node={}:\\n'.format(\n", - " '{} {}:{}'.format(*T.sectionFromNode(sNode)),\n", - " caseText, \n", - " rootType, \n", - " sNode,\n", - " ))\n", - " for kind in ('e', 'r'):\n", - " out.write('\\nTree based on slot embedding {}\\n\\n'.format(\n", - " 'only' if kind == 'e' else ' and mother+clause_constituent relation'\n", - " ))\n", - " (treeRep, wordsRep, bSlot) = tree.writeTree(sNode, kind, getTag, rev=False, leafNumbers=False)\n", - " out.write('{}\\n\\n{}\\n'.format(wordsRep, treeRep))\n", - " out.write('\\nDepth={}\\n'.format(tree.depth(sNode, kind)))\n", - " out.write(tree.debugWriteTree(sNode, kind, legenda=kind=='r'))" + " out.write(\n", + " \"\\n====================\\n{}\\n{}\\n{} TF-node={}:\\n\".format(\n", + " \"{} {}:{}\".format(*T.sectionFromNode(sNode)),\n", + " caseText,\n", + " rootType,\n", + " sNode,\n", + " )\n", + " )\n", + " for kind in (\"e\", \"r\"):\n", + " out.write(\n", + " \"\\nTree based on slot embedding {}\\n\\n\".format(\n", + " \"only\"\n", + " if kind == \"e\"\n", + " else \" and mother+clause_constituent relation\"\n", + " )\n", + " )\n", + " (treeRep, wordsRep, bSlot) = tree.writeTree(\n", + " sNode, kind, getTag, rev=False, leafNumbers=False\n", + " )\n", + " out.write(\"{}\\n\\n{}\\n\".format(wordsRep, treeRep))\n", + " out.write(\"\\nDepth={}\\n\".format(tree.depth(sNode, kind)))\n", + " out.write(tree.debugWriteTree(sNode, kind, legenda=kind == \"r\"))" ] }, { @@ -280,28 +299,30 @@ "source": [ "# below holds for etcbc3, in etcbc4 we have less problem cases\n", "\n", - "problem_desc = collections.OrderedDict((\n", - " (1131739, \"debug reorder\"),\n", - " (1131712, \"interesting\"), \n", - " (1131701, \"interesting\"),\n", - " (1140469, \"subject clause order\"),\n", - " (passageRoots(('Genesis', 1, 16))[0], \"interesting\"), \n", - " (1164864, \"interesting\"),\n", - " (1143081, \"cyclic mothers\"),\n", - " (1153973, \"cyclic mothers\"),\n", - " (1158971, \"cyclic mothers\"),\n", - " (1158971, \"cyclic mothers\"),\n", - " (1160416, \"cyclic mothers\"),\n", - " (1160464, \"cyclic mothers\"),\n", - " (1161141, \"nested cyclic mothers: C.coor => C.attr => P below first C.coor\"), \n", - " (1163666, \"cyclic mothers\"), \n", - " (1164830, \"cyclic mothers\"), \n", - " (1167680, \"cyclic mothers\"), \n", - " (1170057, \"cyclic mothers\"), \n", - " (1193065, \"cyclic mothers\"), \n", - " (1199681, \"cyclic mothers\"), \n", - " (1199682, \"mother points outside sentence\"),\n", - "))\n", + "problem_desc = collections.OrderedDict(\n", + " (\n", + " (1131739, \"debug reorder\"),\n", + " (1131712, \"interesting\"),\n", + " (1131701, \"interesting\"),\n", + " (1140469, \"subject clause order\"),\n", + " (passageRoots((\"Genesis\", 1, 16))[0], \"interesting\"),\n", + " (1164864, \"interesting\"),\n", + " (1143081, \"cyclic mothers\"),\n", + " (1153973, \"cyclic mothers\"),\n", + " (1158971, \"cyclic mothers\"),\n", + " (1158971, \"cyclic mothers\"),\n", + " (1160416, \"cyclic mothers\"),\n", + " (1160464, \"cyclic mothers\"),\n", + " (1161141, \"nested cyclic mothers: C.coor => C.attr => P below first C.coor\"),\n", + " (1163666, \"cyclic mothers\"),\n", + " (1164830, \"cyclic mothers\"),\n", + " (1167680, \"cyclic mothers\"),\n", + " (1170057, \"cyclic mothers\"),\n", + " (1193065, \"cyclic mothers\"),\n", + " (1199681, \"cyclic mothers\"),\n", + " (1199682, \"mother points outside sentence\"),\n", + " )\n", + ")\n", "fixedSample = (\n", " 1167680,\n", " 1167152,\n", @@ -330,23 +351,19 @@ "motherKeys = list(sorted(tree.mother))\n", "for s in range(20):\n", " r = random.randint(0, len(motherKeys) - 1)\n", - " sNode = tree.getRoot(tree.mother[motherKeys[r]], 'e')[0]\n", - " sample[sNode] = 'random sample in {}s with {}s with mothers'.format(rootType, clauseType)\n", + " sNode = tree.getRoot(tree.mother[motherKeys[r]], \"e\")[0]\n", + " sample[sNode] = \"random sample in {}s with {}s with mothers\".format(\n", + " rootType, clauseType\n", + " )\n", "for sNode in fixedSample:\n", - " fSample[sNode] = 'random sample in {}s with {}s with mothers'.format(rootType, clauseType)\n", + " fSample[sNode] = \"random sample in {}s with {}s with mothers\".format(\n", + " rootType, clauseType\n", + " )\n", "\n", - "#showcases(problemDesc, 'tree-notabene.txt')\n", - "#showcases(sample, '{}/trees-{}-random-{}.txt'.format(OUTPUTDIR, VERSION, sampleSize))\n", - "#showcases(fsample, 'trees-fixed-{}.txt'.format(len(fsample)))" + "# showcases(problemDesc, 'tree-notabene.txt')\n", + "# showcases(sample, '{}/trees-{}-random-{}.txt'.format(OUTPUTDIR, VERSION, sampleSize))\n", + "# showcases(fsample, 'trees-fixed-{}.txt'.format(len(fsample)))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf8dbbdf-8700-4528-864e-978c3db635e1", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/programs/trees.ipynb b/programs/trees.ipynb index 0dcfa64..9d4febb 100644 --- a/programs/trees.ipynb +++ b/programs/trees.ipynb @@ -25,7 +25,7 @@ "This notebook composes syntax trees out of the\n", "[BHSA](https://etcbc.github.io/bhsa/) dataset of the Hebrew Bible, its text and it linguistic annotations.\n", "\n", - "The source data is the \n", + "The source data is the\n", "[text-fabric](https://github.com/Dans-labs/text-fabric/wiki) representation of this dataset.\n", "\n", "The result is a set of roughly 65,000 tree structures, one for each sentence, in\n", @@ -67,8 +67,8 @@ "The process of tree construction is not straightforward,\n", "since the BHSA data have not been coded as syntax trees.\n", "Rather they take the shape of a collection of features that describe\n", - "observable characteristics of the words, phrases, clauses and sentences. \n", - "Moreover, if a phrase, clause or sentence is discontinuous, \n", + "observable characteristics of the words, phrases, clauses and sentences.\n", + "Moreover, if a phrase, clause or sentence is discontinuous,\n", "it is divided in *phrase_atoms*, *clause_atoms*,\n", "or *sentence_atoms*, respectively, which are by definition continuous.\n", "\n", @@ -81,11 +81,11 @@ "This notebook makes use of a Python module `tree.py` (in the same directory).\n", "This module works on top of Text-Fabric and knows the general structure of an ancient text.\n", "It constructs a hierarchy of words, subphrases, phrases, clauses and sentences\n", - "based on the embedding relationship. \n", + "based on the embedding relationship.\n", "\n", - "But this is not all. \n", + "But this is not all.\n", "The BHSA data contains a *mother* relationship,\n", - "which denotes linguistic dependency. \n", + "which denotes linguistic dependency.\n", "The module `trees.py` reconstructs the tree obtained from the embedding relationship\n", "by using the mother relationship as a set of instructions to move certain nodes below others.\n", "In some cases extra nodes will be constructed as well." @@ -103,39 +103,39 @@ "metadata": {}, "source": [ "### Nodes:\n", - "The BHSA data is coded in such a way that every node is associated with a *type* and a *slot set*. \n", + "The BHSA data is coded in such a way that every node is associated with a *type* and a *slot set*.\n", "\n", - "The *type* of a node, $T(O)$, determines which features a node has. \n", + "The *type* of a node, $T(O)$, determines which features a node has.\n", "BHSA types are `sentence`, `sentence_atom`,\n", "`clause`, `clause_atom`, `phrase`, `phrase_atom`, `subphrase`, `word`, and there are also\n", "the non-linguistic types `book`, `chapter`, `verse` and `half_verse`.\n", "\n", - "There is an implicit *ordering of node types*, given by the sequence above, where `word` comes first and \n", + "There is an implicit *ordering of node types*, given by the sequence above, where `word` comes first and\n", "`sentence` comes last. We denote this ordering by $<$.\n", "\n", "The *slot set* of a node, $m(O)$, is the set of word occurrences linked to that node.\n", "Every word occurrence in the source occupies a unique slot, which is a number, so slot sets are sets of numbers.\n", "Think of the slots as the textual positions of individual words throughout the whole text.\n", "\n", - "Note that when a sentence contains a clause which contains a phrase, \n", + "Note that when a sentence contains a clause which contains a phrase,\n", "the sentence, clause, and phrase are linked to slot sets that contain each other.\n", "The fact that a sentence \"contains\" a clause is not marked directly,\n", "it is a consequence of how the slot sets they are linked to are embedded.\n", "\n", "### Definition (slot set order):\n", - "There is a \n", + "There is a\n", "[natural order](https://github.com/Dans-labs/text-fabric/wiki/Api#sorting-nodes)\n", "on slot sets, which we will use.\n", "\n", - "We will not base our trees on *all* node types, \n", + "We will not base our trees on *all* node types,\n", "since in the BHSA data they do not constitute a single hierarchy.\n", "We will restrict ourselves to the set $\\cal O = \\{$ ``sentence``, ``clause``, ``phrase``, ``word`` $\\}$.\n", "\n", "### Definition (directly below):\n", - "Node type $T_1$ \n", - "is *directly below* \n", - "$T_2$ ( $T_1 <_1 T_2 $ ) in $\\cal O$ \n", - "if $T_1 < T_2$ \n", + "Node type $T_1$\n", + "is *directly below*\n", + "$T_2$ ( $T_1 <_1 T_2 $ ) in $\\cal O$\n", + "if $T_1 < T_2$\n", "and there is no $T$ in $\\cal O$ with\n", "$T_1 < T < T_2$.\n", "\n", @@ -144,7 +144,7 @@ "\n", "### Definition (parent)\n", "Node $A$ is a parent of node $B$ if the following are true:\n", - "1. $m(A) \\subseteq\\ m(B)$ \n", + "1. $m(A) \\subseteq\\ m(B)$\n", "2. $T(A) <_1 T(B)$ in $\\cal O$." ] }, @@ -159,9 +159,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "While using the embedding got us trees, \n", + "While using the embedding got us trees,\n", "using the mother relationship will give us more interesting trees.\n", - "In general, the *mother* in the BHSA dataset points to a node \n", + "In general, the *mother* in the BHSA dataset points to a node\n", "on which the node in question is, in some sense, dependent.\n", "The nature of this dependency is coded in a specific feature on clauses,\n", "the `clause_constituent_relation` in version 3,\n", @@ -179,28 +179,28 @@ "\n", "In case 3 we do nothing.\n", "\n", - "In case 1 we remove the link of the clause to its parent \n", + "In case 1 we remove the link of the clause to its parent\n", "and add the clause as a child to either the node\n", - "that the mother points to, or to the parent of the mother. \n", + "that the mother points to, or to the parent of the mother.\n", "We do the latter only if the mother is a word.\n", "We will not add children to words.\n", "\n", - "In the diagrams, the red arrows represent the mother relationship, \n", - "and the black arrows the embedding relationships, \n", - "and the fat black arrows the new parent relationships. \n", + "In the diagrams, the red arrows represent the mother relationship,\n", + "and the black arrows the embedding relationships,\n", + "and the fat black arrows the new parent relationships.\n", "The gray arrows indicated severed parent links.\n", "\n", "\n", "\n", "In case 2 we create a node between the mother and its parent.\n", - "This node takes the name of the mother, and the mother will be added as child, \n", + "This node takes the name of the mother, and the mother will be added as child,\n", "but with name ``Ccoor``, and the clause which points to the mother is added as a sister.\n", "\n", - "This is a rather complicated case, but the intuition is not that difficult. \n", + "This is a rather complicated case, but the intuition is not that difficult.\n", "Consider the sentence:\n", "\n", " John thinks that Mary said it and did it\n", - " \n", + "\n", "We have a compound object sentence, with ``Mary said it`` and ``did it`` as coordinated components.\n", "The way this has been marked up in the BHSA database is as follows:\n", "\n", @@ -208,47 +208,47 @@ "\n", "``and did it``, clause with ``clause_constituent_relation``=``Coor``, ``mother``=``Mary said it``(clause)\n", "\n", - "So the second coordinated clause is simply linked to the first coordinated clause. \n", - "Restructuring means to create a parent for both coordinated clauses \n", - "and treat both as sisters at the same hierarchical level. \n", + "So the second coordinated clause is simply linked to the first coordinated clause.\n", + "Restructuring means to create a parent for both coordinated clauses\n", + "and treat both as sisters at the same hierarchical level.\n", "See the diagram.\n", "\n", "\n", "\n", "### Note on order\n", - "When we add nodes to new parents, we let them occupy the sequential position \n", + "When we add nodes to new parents, we let them occupy the sequential position\n", "among its new sisters that corresponds with the slot set ordering.\n", "\n", "### Note on discontinuity\n", - "Sentences, clauses and phrases are not always continuous. \n", + "Sentences, clauses and phrases are not always continuous.\n", "Before restructuring it will not always be the case that if you\n", - "walk the tree in pre-order, you will end up with the leaves (the words) \n", + "walk the tree in pre-order, you will end up with the leaves (the words)\n", "in the same order as the original sentence.\n", - "Restructuring generally improves that, because it often puts \n", - "a node under a non-continuous parent object precisely at the location \n", + "Restructuring generally improves that, because it often puts\n", + "a node under a non-continuous parent object precisely at the location\n", "that corresponds with the a gap in the parent.\n", "\n", "However, there is no guarantee that every discontinuity will be resolved in this graceful manner.\n", - "When we create the trees, we also output the list of slot numbers \n", + "When we create the trees, we also output the list of slot numbers\n", "that you get when you walk the tree in pre-order.\n", "Whenever this list is not monotonic, there is an issue with the ordering.\n", "\n", "### Note on cycles\n", - "If a mother points to itself or a descendant of itself, we have a cycle in the mother relationship. \n", - "In these cases, the restructuring algorithm will disconnect a parent link \n", - "without introducing a new link to the tree above it: \n", + "If a mother points to itself or a descendant of itself, we have a cycle in the mother relationship.\n", + "In these cases, the restructuring algorithm will disconnect a parent link\n", + "without introducing a new link to the tree above it:\n", "a whole fragment of the tree becomes disconnected and will get lost.\n", "\n", - "Sanity check 6 below reveals that this occurs in fact 4 times in the BHSA version 4 \n", - "(it occurred 13 times in the BHSA 3 version). \n", + "Sanity check 6 below reveals that this occurs in fact 4 times in the BHSA version 4\n", + "(it occurred 13 times in the BHSA 3 version).\n", "We will exclude these trees from further processing.\n", "\n", "### Note on stretch\n", - "If a mother points outside the sentence of the clause \n", + "If a mother points outside the sentence of the clause\n", "on which it is specified we have a case of stretch.\n", - "This should not happen. Mothers may point outside their sentences, \n", + "This should not happen. Mothers may point outside their sentences,\n", "but not in the cases that trigger restructuring.\n", - "Yet, the sanity checks below reveal that this does occur in some versions. \n", + "Yet, the sanity checks below reveal that this does occur in some versions.\n", "We will exclude these cases from further processing." ] }, @@ -277,14 +277,12 @@ "%load_ext autoreload\n", "%autoreload 2\n", "\n", - "import sys\n", "import os\n", "import collections\n", - "import random\n", "\n", "from tf.fabric import Fabric\n", "\n", - "from tree import Tree" + "from tree import Tree\n" ] }, { @@ -303,19 +301,19 @@ "metadata": {}, "outputs": [], "source": [ - "VERSION = '2017'\n", - "BHSA = 'BHSA/tf/{}'.format(VERSION)\n", - "OUTPUTDIR = '_temp/{}'.format(VERSION)\n", - "TFDIR = 'tf/{}'.format(VERSION)\n", + "VERSION = \"2017\"\n", + "BHSA = \"BHSA/tf/{}\".format(VERSION)\n", + "OUTPUTDIR = \"_temp/{}\".format(VERSION)\n", + "TFDIR = \"tf/{}\".format(VERSION)\n", "\n", "os.makedirs(OUTPUTDIR, exist_ok=True)\n", "os.makedirs(TFDIR, exist_ok=True)\n", "\n", - "sp = 'part_of_speech' if VERSION == '3' else 'sp'\n", - "rela = 'clause_constituent_relation' if VERSION == '3' else 'rela'\n", - "ptyp = 'phrase_type' if VERSION == '3' else 'typ'\n", - "ctyp = 'clause_atom_type' if VERSION == '3' else 'typ'\n", - "g_word_utf8 = 'text' if VERSION == '3' else 'g_word_utf8'" + "sp = \"part_of_speech\" if VERSION == \"3\" else \"sp\"\n", + "rela = \"clause_constituent_relation\" if VERSION == \"3\" else \"rela\"\n", + "ptyp = \"phrase_type\" if VERSION == \"3\" else \"typ\"\n", + "ctyp = \"clause_atom_type\" if VERSION == \"3\" else \"typ\"\n", + "g_word_utf8 = \"text\" if VERSION == \"3\" else \"g_word_utf8\"" ] }, { @@ -359,12 +357,14 @@ } ], "source": [ - "TF = Fabric(locations='~/github/etcbc', modules=BHSA)\n", - "api = TF.load(f'''\n", + "TF = Fabric(locations=\"~/github/etcbc\", modules=BHSA)\n", + "api = TF.load(\n", + " f\"\"\"\n", " {sp} {rela} {ptyp} {ctyp}\n", " {g_word_utf8}\n", " mother\n", - "''')\n", + "\"\"\"\n", + ")\n", "api.makeAvailableIn(globals())" ] }, @@ -372,7 +372,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We are going to make convenient labels for constituents, words and clauses, based on the \n", + "We are going to make convenient labels for constituents, words and clauses, based on the\n", "the types of textual objects and the features\n", "`sp` and `rela`." ] @@ -391,11 +391,11 @@ "outputs": [], "source": [ "typeInfo = (\n", - " (\"word\", ''),\n", - " (\"subphrase\", 'U'),\n", - " (\"phrase\", 'P'),\n", - " (\"clause\", 'C'),\n", - " (\"sentence\", 'S'),\n", + " (\"word\", \"\"),\n", + " (\"subphrase\", \"U\"),\n", + " (\"phrase\", \"P\"),\n", + " (\"clause\", \"C\"),\n", + " (\"sentence\", \"S\"),\n", ")\n", "typeTable = dict(t for t in typeInfo)\n", "typeOrder = [t[0] for t in typeInfo]" @@ -448,30 +448,30 @@ "outputs": [], "source": [ "posTable = {\n", - " 'adjv': 'aj',\n", - " 'adjective': 'aj',\n", - " 'advb': 'av',\n", - " 'adverb': 'av',\n", - " 'art': 'dt',\n", - " 'article': 'dt',\n", - " 'conj': 'cj',\n", - " 'conjunction': 'cj',\n", - " 'inrg': 'ir',\n", - " 'interrogative': 'ir',\n", - " 'intj': 'ij',\n", - " 'interjection': 'ij',\n", - " 'nega': 'ng',\n", - " 'negative': 'ng',\n", - " 'nmpr': 'n-pr',\n", - " 'pronoun': 'pr',\n", - " 'prde': 'pr-dem',\n", - " 'prep': 'pp',\n", - " 'preposition': 'pp',\n", - " 'prin': 'pr-int',\n", - " 'prps': 'pr-ps',\n", - " 'subs': 'n',\n", - " 'noun': 'n',\n", - " 'verb': 'vb',\n", + " \"adjv\": \"aj\",\n", + " \"adjective\": \"aj\",\n", + " \"advb\": \"av\",\n", + " \"adverb\": \"av\",\n", + " \"art\": \"dt\",\n", + " \"article\": \"dt\",\n", + " \"conj\": \"cj\",\n", + " \"conjunction\": \"cj\",\n", + " \"inrg\": \"ir\",\n", + " \"interrogative\": \"ir\",\n", + " \"intj\": \"ij\",\n", + " \"interjection\": \"ij\",\n", + " \"nega\": \"ng\",\n", + " \"negative\": \"ng\",\n", + " \"nmpr\": \"n-pr\",\n", + " \"pronoun\": \"pr\",\n", + " \"prde\": \"pr-dem\",\n", + " \"prep\": \"pp\",\n", + " \"preposition\": \"pp\",\n", + " \"prin\": \"pr-int\",\n", + " \"prps\": \"pr-ps\",\n", + " \"subs\": \"n\",\n", + " \"noun\": \"n\",\n", + " \"verb\": \"vb\",\n", "}" ] }, @@ -531,26 +531,26 @@ "outputs": [], "source": [ "ccrInfo = {\n", - " 'Adju': ('r', 'Cadju'),\n", - " 'Appo': ('r', 'Cappo'),\n", - " 'Attr': ('r', 'Cattr'),\n", - " 'Cmpl': ('r', 'Ccmpl'),\n", - " 'Coor': ('x', 'Ccoor'),\n", - " 'CoVo': ('n', 'Ccovo'),\n", - " 'Link': ('r', 'Clink'),\n", - " 'Objc': ('r', 'Cobjc'),\n", - " 'Para': ('r', 'Cpara'),\n", - " 'PrAd': ('r', 'Cprad'),\n", - " 'PreC': ('r', 'Cprec'),\n", - " 'Pred': ('r', 'Cpred'),\n", - " 'ReVo': ('n', 'Crevo'),\n", - " 'Resu': ('n', 'Cresu'),\n", - " 'RgRc': ('r', 'Crgrc'),\n", - " 'Sfxs': ('r', 'Csfxs'),\n", - " 'Spec': ('r', 'Cspec'),\n", - " 'Subj': ('r', 'Csubj'),\n", - " 'NA': ('n', 'C'),\n", - " 'none': ('n', 'C'),\n", + " \"Adju\": (\"r\", \"Cadju\"),\n", + " \"Appo\": (\"r\", \"Cappo\"),\n", + " \"Attr\": (\"r\", \"Cattr\"),\n", + " \"Cmpl\": (\"r\", \"Ccmpl\"),\n", + " \"Coor\": (\"x\", \"Ccoor\"),\n", + " \"CoVo\": (\"n\", \"Ccovo\"),\n", + " \"Link\": (\"r\", \"Clink\"),\n", + " \"Objc\": (\"r\", \"Cobjc\"),\n", + " \"Para\": (\"r\", \"Cpara\"),\n", + " \"PrAd\": (\"r\", \"Cprad\"),\n", + " \"PreC\": (\"r\", \"Cprec\"),\n", + " \"Pred\": (\"r\", \"Cpred\"),\n", + " \"ReVo\": (\"n\", \"Crevo\"),\n", + " \"Resu\": (\"n\", \"Cresu\"),\n", + " \"RgRc\": (\"r\", \"Crgrc\"),\n", + " \"Sfxs\": (\"r\", \"Csfxs\"),\n", + " \"Spec\": (\"r\", \"Cspec\"),\n", + " \"Subj\": (\"r\", \"Csubj\"),\n", + " \"NA\": (\"n\", \"C\"),\n", + " \"none\": (\"n\", \"C\"),\n", "}" ] }, @@ -560,10 +560,15 @@ "metadata": {}, "outputs": [], "source": [ - "treeTypes = ('sentence', 'clause', 'phrase', 'subphrase', 'word')\n", - "(rootType, leafType, clauseType, phraseType) = (treeTypes[0], treeTypes[-1], treeTypes[1], treeTypes[2])\n", - "ccrTable = dict((c[0],c[1][1]) for c in ccrInfo.items())\n", - "ccrClass = dict((c[0],c[1][0]) for c in ccrInfo.items())" + "treeTypes = (\"sentence\", \"clause\", \"phrase\", \"subphrase\", \"word\")\n", + "(rootType, leafType, clauseType, phraseType) = (\n", + " treeTypes[0],\n", + " treeTypes[-1],\n", + " treeTypes[1],\n", + " treeTypes[2],\n", + ")\n", + "ccrTable = dict((c[0], c[1][1]) for c in ccrInfo.items())\n", + "ccrClass = dict((c[0], c[1][0]) for c in ccrInfo.items())" ] }, { @@ -573,7 +578,7 @@ "Now we can actually construct the tree by initializing a tree object.\n", "After that we call its ``restructureClauses()`` method.\n", "\n", - "Then we have two tree structures for each sentence: \n", + "Then we have two tree structures for each sentence:\n", "\n", "* the *etree*, i.e. the tree obtained by working out the embedding relationships and nothing else\n", "* the *rtree*, i.e. the tree obtained by restructuring the *etree*\n", @@ -616,13 +621,15 @@ } ], "source": [ - "tree = Tree(TF, otypes=treeTypes, \n", + "tree = Tree(\n", + " TF,\n", + " otypes=treeTypes,\n", " phraseType=phraseType,\n", " clauseType=clauseType,\n", " ccrFeature=rela,\n", " ptFeature=ptyp,\n", " posFeature=sp,\n", - " motherFeature='mother',\n", + " motherFeature=\"mother\",\n", ")" ] }, @@ -653,10 +660,10 @@ "source": [ "tree.restructureClauses(ccrClass)\n", "results = tree.relations()\n", - "parent = results['rparent']\n", - "sisters = results['sisters']\n", - "children = results['rchildren']\n", - "elderSister = results['elderSister']\n", + "parent = results[\"rparent\"]\n", + "sisters = results[\"sisters\"]\n", + "children = results[\"rchildren\"]\n", + "elderSister = results[\"elderSister\"]\n", "TF.info(\"Ready for processing\")" ] }, @@ -672,34 +679,34 @@ "metadata": {}, "source": [ "Let us see whether the trees we have constructed satisfy some sanity constraints.\n", - "After all, the algorithm is based on certain assumptions about the data, \n", + "After all, the algorithm is based on certain assumptions about the data,\n", "but are those assumptions valid?\n", "And restructuring is a tricky operation, do we have confidence that nothing went wrong?\n", "\n", "1. How many sentence nodes? From earlier queries we know what to expect.\n", - "1. Does any sentence have a parent? \n", + "1. Does any sentence have a parent?\n", " If so, there is something wrong with our assumptions or algorithm.\n", - "1. Is every top node a sentence? \n", + "1. Is every top node a sentence?\n", " If not, we have material outside a sentence, which contradicts the assumptions.\n", - "1. Do you reach all sentences if you go up from words? \n", + "1. Do you reach all sentences if you go up from words?\n", " If not, some sentences do not contain words.\n", - "1. Do you reach all words if you go down from sentences? \n", + "1. Do you reach all words if you go down from sentences?\n", " If not, some words have become disconnected from their sentences.\n", - "1. Do you reach the same words in reconstructed trees as in embedded trees? \n", + "1. Do you reach the same words in reconstructed trees as in embedded trees?\n", " If not, some sentence material has got lost during the restructuring process.\n", - "1. From what object types to what object types does the parent relationship link? \n", - " Here we check that parents do not link object types \n", + "1. From what object types to what object types does the parent relationship link?\n", + " Here we check that parents do not link object types\n", " that are too distant in the object type ranking.\n", - "1. How many nodes have mothers and how many mothers can a node have? \n", + "1. How many nodes have mothers and how many mothers can a node have?\n", " We expect at most one.\n", "1. From what object types to what object types does the mother relationship link?\n", - "1. Is the mother of a clause always in the same sentence? \n", - " If not, foreign sentences will be drawn in, leading to (very) big chunks. \n", - " This may occur when we use mother relationships in cases where \n", + "1. Is the mother of a clause always in the same sentence?\n", + " If not, foreign sentences will be drawn in, leading to (very) big chunks.\n", + " This may occur when we use mother relationships in cases where\n", " `rela` has different values than the ones that should trigger restructuring.\n", - "1. Has the max/average tree depth increased after restructuring? \n", - " By how much? This is meant as an indication by how much \n", - " our tree structures improve in significant hierarchy \n", + "1. Has the max/average tree depth increased after restructuring?\n", + " By how much? This is meant as an indication by how much\n", + " our tree structures improve in significant hierarchy\n", " when we take the mother relationship into account." ] }, @@ -735,15 +742,19 @@ } ], "source": [ - "#1\n", + "# 1\n", "expectedSentences = {\n", - " '3': 71354,\n", - " '4': 66045,\n", - " '4b': 63586,\n", - " '2016': 63570,\n", - " '2017': 63711,\n", + " \"3\": 71354,\n", + " \"4\": 66045,\n", + " \"4b\": 63586,\n", + " \"2016\": 63570,\n", + " \"2017\": 63711,\n", "}\n", - "TF.info(\"Counting {}s ... (expecting {})\".format(rootType, expectedSentences.get(VERSION, '??')))\n", + "TF.info(\n", + " \"Counting {}s ... (expecting {})\".format(\n", + " rootType, expectedSentences.get(VERSION, \"??\")\n", + " )\n", + ")\n", "TF.info(\"There are {} {}s\".format(len(list(F.otype.s(rootType))), rootType))" ] }, @@ -762,21 +773,23 @@ } ], "source": [ - "#2\n", + "# 2\n", "TF.info(\"Checking parents of {}s ... (expecting none)\".format(rootType))\n", "exceptions = set()\n", "for node in F.otype.s(rootType):\n", - " if node in parent: exceptions.add(node)\n", + " if node in parent:\n", + " exceptions.add(node)\n", "if len(exceptions) == 0:\n", " TF.info(\"No {} has a parent\".format(rootType))\n", "else:\n", " TF.error(\"{} {}s have a parent:\".format(len(exceptions), rootType))\n", " for n in sorted(exceptions):\n", " p = parent[n]\n", - " msg(\"{} {} [{}] has {} parent {} [{}]\".format(\n", - " rootType, n, tree.slotss(n), \n", - " F.otype.v(p), p, tree.slotss(p)\n", - " ))" + " TF.error(\n", + " \"{} {} [{}] has {} parent {} [{}]\".format(\n", + " rootType, n, tree.slotss(n), F.otype.v(p), p, tree.slotss(p)\n", + " )\n", + " )" ] }, { @@ -796,23 +809,29 @@ } ], "source": [ - "#3 (again a check on #1)\n", - "TF.info('Checking the types of root nodes ... (should all be {}s)'.format(rootType))\n", + "# 3 (again a check on #1)\n", + "TF.info(\"Checking the types of root nodes ... (should all be {}s)\".format(rootType))\n", "expectedTops = {\n", - " '3': 0,\n", - " '4': '3 subphrases',\n", - " '4b': 0,\n", - " '2016': 0,\n", - " '2017':0,\n", + " \"3\": 0,\n", + " \"4\": \"3 subphrases\",\n", + " \"4b\": 0,\n", + " \"2016\": 0,\n", + " \"2017\": 0,\n", "}\n", - "TF.info('Expected roots which are non-{}s: {}'.format(rootType, expectedTops.get(VERSION, '??')))\n", + "TF.info(\n", + " \"Expected roots which are non-{}s: {}\".format(\n", + " rootType, expectedTops.get(VERSION, \"??\")\n", + " )\n", + ")\n", "exceptions = collections.defaultdict(lambda: [])\n", "sn = 0\n", "for node in N.walk():\n", " otype = F.otype.v(node)\n", - " if otype not in typeTable: continue\n", - " if otype == rootType: sn += 1\n", - " if node not in parent and node not in elderSister and otype != rootType: \n", + " if otype not in typeTable:\n", + " continue\n", + " if otype == rootType:\n", + " sn += 1\n", + " if node not in parent and node not in elderSister and otype != rootType:\n", " exceptions[otype].append(node)\n", "TF.info(\"{} {}s seen\".format(sn, rootType))\n", "\n", @@ -824,9 +843,11 @@ " TF.error(\"{}: {}x\".format(t, len(exceptions[t])), tm=False)\n", "\n", "for c in exceptions[clauseType]:\n", - " (s, st) = tree.getRoot(c, 'e')\n", + " (s, st) = tree.getRoot(c, \"e\")\n", " v = rootVerse[s]\n", - " TF.error(\"{}={}, {}={}={}, verse={}\".format(clauseType, c, rootType, st, s, v), tm=False)" + " TF.error(\n", + " \"{}={}, {}={}={}, verse={}\".format(clauseType, c, rootType, st, s, v), tm=False\n", + " )" ] }, { @@ -853,46 +874,56 @@ } ], "source": [ - "#4, 5\n", + "# 4, 5\n", "def getTop(kind, rel, rela, multi):\n", " seen = set()\n", " topNodes = set()\n", " startNodes = set(F.otype.s(kind))\n", " nextNodes = startNodes\n", - " TF.info('Starting from {} nodes ...'.format(kind))\n", + " TF.info(\"Starting from {} nodes ...\".format(kind))\n", " while len(nextNodes):\n", " newNextNodes = set()\n", " for node in nextNodes:\n", - " if node in seen: continue\n", + " if node in seen:\n", + " continue\n", " seen.add(node)\n", " isTop = True\n", - " if node in rel: \n", + " if node in rel:\n", " isTop = False\n", " if multi:\n", - " for c in rel[node]: newNextNodes.add(c)\n", + " for c in rel[node]:\n", + " newNextNodes.add(c)\n", " else:\n", " newNextNodes.add(rel[node])\n", - " if node in rela: \n", + " if node in rela:\n", " isTop = False\n", " if multi:\n", - " for c in rela[node]: newNextNodes.add(c)\n", + " for c in rela[node]:\n", + " newNextNodes.add(c)\n", " else:\n", " newNextNodes.add(rela[node])\n", - " if isTop: topNodes.add(node)\n", + " if isTop:\n", + " topNodes.add(node)\n", " nextNodes = newNextNodes\n", " topTypes = collections.defaultdict(lambda: 0)\n", " for t in topNodes:\n", " topTypes[F.otype.v(t)] += 1\n", " for t in topTypes:\n", - " TF.info('From {} {} nodes reached {} {} nodes'.format(len(startNodes), kind, topTypes[t], t), tm=False)\n", + " TF.info(\n", + " \"From {} {} nodes reached {} {} nodes\".format(\n", + " len(startNodes), kind, topTypes[t], t\n", + " ),\n", + " tm=False,\n", + " )\n", + "\n", "\n", - "TF.info('Embedding trees')\n", + "TF.info(\"Embedding trees\")\n", "getTop(leafType, tree.eparent, {}, False)\n", "getTop(rootType, tree.echildren, {}, True)\n", - "TF.info('Restructd trees')\n", + "TF.info(\"Restructd trees\")\n", "getTop(leafType, tree.rparent, tree.elderSister, False)\n", "getTop(rootType, tree.rchildren, tree.sisters, True)\n", - "TF.info('Done')" + "TF.info(\"Done\")" ] }, { @@ -911,40 +942,48 @@ } ], "source": [ - "#6\n", - "TF.info('Verifying whether all slots are preserved under restructuring')\n", + "# 6\n", + "TF.info(\"Verifying whether all slots are preserved under restructuring\")\n", "expectedMismatches = {\n", - " '3': 13,\n", - " '4': 3,\n", - " '4b': 0,\n", - " '2016': 0,\n", - " '2017': 0,\n", + " \"3\": 13,\n", + " \"4\": 3,\n", + " \"4b\": 0,\n", + " \"2016\": 0,\n", + " \"2017\": 0,\n", "}\n", - "TF.info('Expected mismatches: {}'.format(expectedMismatches.get(VERSION, '??')))\n", + "TF.info(\"Expected mismatches: {}\".format(expectedMismatches.get(VERSION, \"??\")))\n", "\n", "errors = []\n", - "#i = 10\n", + "# i = 10\n", "for snode in F.otype.s(rootType):\n", " declaredSlots = set(E.oslots.s(snode))\n", " results = {}\n", " thisgood = {}\n", - " for kind in ('e', 'r'):\n", - " results[kind] = set(l for l in tree.getLeaves(snode, kind) if F.otype.v(l) == leafType)\n", + " for kind in (\"e\", \"r\"):\n", + " results[kind] = set(\n", + " lf for lf in tree.getLeaves(snode, kind) if F.otype.v(lf) == leafType\n", + " )\n", " thisgood[kind] = declaredSlots == results[kind]\n", - " #if not thisgood[kind]:\n", + " # if not thisgood[kind]:\n", " # print('{} D={}\\n L={}'.format(kind, declaredSlots, results[kind]))\n", " # i -= 1\n", - " #if i == 0: break\n", - " if False in thisgood.values(): errors.append((snode, thisgood['e'], thisgood['r']))\n", + " # if i == 0: break\n", + " if False in thisgood.values():\n", + " errors.append((snode, thisgood[\"e\"], thisgood[\"r\"]))\n", "nErrors = len(errors)\n", "if nErrors:\n", - " TF.error('{} mismatches:'.format(len(errors)))\n", + " TF.error(\"{} mismatches:\".format(len(errors)))\n", " mine = min(20, len(errors))\n", " skip |= {e[0] for e in errors}\n", " for (s, e, r) in errors[0:mine]:\n", - " TF.error('{} embedding: {}; restructd: {}'.format(s, 'OK' if e else 'XX', 'OK' if r else 'XX'), tm=False)\n", + " TF.error(\n", + " \"{} embedding: {}; restructd: {}\".format(\n", + " s, \"OK\" if e else \"XX\", \"OK\" if r else \"XX\"\n", + " ),\n", + " tm=False,\n", + " )\n", "else:\n", - " TF.info('{} mismatches'.format(len(errors)))" + " TF.info(\"{} mismatches\".format(len(errors)))" ] }, { @@ -978,17 +1017,17 @@ } ], "source": [ - "#7\n", - "TF.info('Which types embed which types and how often? ...')\n", - "for kind in ('e', 'r'):\n", + "# 7\n", + "TF.info(\"Which types embed which types and how often? ...\")\n", + "for kind in (\"e\", \"r\"):\n", " pLinkedTypes = collections.defaultdict(lambda: 0)\n", - " parent = tree.eparent if kind == 'e' else tree.rparent\n", - " kindRep = 'embedding' if kind == 'e' else 'restructd'\n", + " parent = tree.eparent if kind == \"e\" else tree.rparent\n", + " kindRep = \"embedding\" if kind == \"e\" else \"restructd\"\n", " for (c, p) in parent.items():\n", " pLinkedTypes[(F.otype.v(c), F.otype.v(p))] += 1\n", " TF.info(\"Found {} parent ({}) links between types\".format(len(parent), kindRep))\n", " for lt in sorted(pLinkedTypes):\n", - " TF.info('{}: {}x'.format(lt, pLinkedTypes[lt]), tm=False)" + " TF.info(\"{}: {}x\".format(lt, pLinkedTypes[lt]), tm=False)" ] }, { @@ -1007,18 +1046,23 @@ } ], "source": [ - "#8\n", - "TF.info('How many mothers can nodes have? ...')\n", + "# 8\n", + "TF.info(\"How many mothers can nodes have? ...\")\n", "motherLen = {}\n", "for c in N.walk():\n", " lms = list(E.mother.f(c))\n", " nms = len(lms)\n", - " if nms: motherLen[c] = nms\n", + " if nms:\n", + " motherLen[c] = nms\n", "count = collections.defaultdict(lambda: 0)\n", - "for c in tree.mother: count[motherLen[c]] += 1\n", - "TF.info('There are {} tree nodes with a mother'.format(len(tree.mother)))\n", + "for c in tree.mother:\n", + " count[motherLen[c]] += 1\n", + "TF.info(\"There are {} tree nodes with a mother\".format(len(tree.mother)))\n", "for cnt in sorted(count):\n", - " TF.info('{} nodes have {} mother{}'.format(count[cnt], cnt, 's' if cnt != 1 else ''), tm=False) " + " TF.info(\n", + " \"{} nodes have {} mother{}\".format(count[cnt], cnt, \"s\" if cnt != 1 else \"\"),\n", + " tm=False,\n", + " )" ] }, { @@ -1055,15 +1099,15 @@ } ], "source": [ - "#9\n", - "TF.info('Which types have mother links to which types and how often? ...')\n", + "# 9\n", + "TF.info(\"Which types have mother links to which types and how often? ...\")\n", "mLinkedTypes = collections.defaultdict(lambda: set())\n", "for (c, m) in tree.mother.items():\n", " ctype = F.otype.v(c)\n", " mLinkedTypes[(ctype, Fs(rela).v(c), F.otype.v(m))].add(c)\n", - "TF.info('Found {} mother links between types'.format(len(parent)))\n", + "TF.info(\"Found {} mother links between types\".format(len(parent)))\n", "for lt in sorted(mLinkedTypes):\n", - " TF.info('{}: {}x'.format(lt, len(mLinkedTypes[lt])), tm=False)" + " TF.info(\"{}: {}x\".format(lt, len(mLinkedTypes[lt])), tm=False)" ] }, { @@ -1082,35 +1126,45 @@ } ], "source": [ - "#10\n", - "TF.info('Counting {}s with mothers in another {}'.format(clauseType, rootType))\n", + "# 10\n", + "TF.info(\"Counting {}s with mothers in another {}\".format(clauseType, rootType))\n", "expectedOther = {\n", - " '3': 2,\n", - " '4': 0,\n", - " '4b': 0,\n", - " '2016': 0,\n", - " '2017': 0,\n", + " \"3\": 2,\n", + " \"4\": 0,\n", + " \"4b\": 0,\n", + " \"2016\": 0,\n", + " \"2017\": 0,\n", "}\n", - "TF.info('Expecting {} {}s with mothers in another {}'.format(expectedOther.get(VERSION, '??'), clauseType, rootType))\n", + "TF.info(\n", + " \"Expecting {} {}s with mothers in another {}\".format(\n", + " expectedOther.get(VERSION, \"??\"), clauseType, rootType\n", + " )\n", + ")\n", "exceptions = set()\n", "for node in tree.mother:\n", - " if F.otype.v(node) not in typeTable: continue\n", + " if F.otype.v(node) not in typeTable:\n", + " continue\n", " mNode = tree.mother[node]\n", - " sNode = tree.getRoot(node, 'e')\n", - " smNode = tree.getRoot(mNode, 'e')\n", + " sNode = tree.getRoot(node, \"e\")\n", + " smNode = tree.getRoot(mNode, \"e\")\n", " if sNode != smNode:\n", - " exceptions.add((node, sNode, smNode))\n", - "TF.info('{} nodes have a mother in another {}'.format(len(exceptions), rootType))\n", + " exceptions.add((node, sNode, smNode))\n", + "TF.info(\"{} nodes have a mother in another {}\".format(len(exceptions), rootType))\n", "for (n, sn, smn) in exceptions:\n", - " TF.error('[{} {}]({}) occurs in {} but has mother in {}'.format(\n", - " F.otype.v(n), tree.slotss(n), n, sn, smn), tm=False,\n", + " TF.error(\n", + " \"[{} {}]({}) occurs in {} but has mother in {}\".format(\n", + " F.otype.v(n), tree.slotss(n), n, sn, smn\n", + " ),\n", + " tm=False,\n", " )" ] }, { "cell_type": "code", "execution_count": 23, - "metadata": {}, + "metadata": { + "lines_to_end_of_cell_marker": 2 + }, "outputs": [ { "name": "stdout", @@ -1128,52 +1182,73 @@ } ], "source": [ - "#11\n", - "TF.info('Computing lengths and depths')\n", + "# 11\n", + "TF.info(\"Computing lengths and depths\")\n", "nTrees = 0\n", "rnTrees = 0\n", - "totalDepth = {'e': 0, 'r': 0}\n", - "rTotalDepth = {'e': 0, 'r': 0}\n", - "maxDepth = {'e': 0, 'r':0}\n", - "rMaxDepth = {'e': 0, 'r': 0}\n", + "totalDepth = {\"e\": 0, \"r\": 0}\n", + "rTotalDepth = {\"e\": 0, \"r\": 0}\n", + "maxDepth = {\"e\": 0, \"r\": 0}\n", + "rMaxDepth = {\"e\": 0, \"r\": 0}\n", "totalLength = 0\n", "\n", "for node in F.otype.s(rootType):\n", " nTrees += 1\n", " totalLength += tree.length(node)\n", " thisDepth = {}\n", - " for kind in ('e', 'r'):\n", + " for kind in (\"e\", \"r\"):\n", " thisDepth[kind] = tree.depth(node, kind)\n", - " different = thisDepth['e'] != thisDepth['r']\n", - " if different: rnTrees += 1\n", - " for kind in ('e', 'r'):\n", - " if thisDepth[kind] > maxDepth[kind]: maxDepth[kind] = thisDepth[kind]\n", + " different = thisDepth[\"e\"] != thisDepth[\"r\"]\n", + " if different:\n", + " rnTrees += 1\n", + " for kind in (\"e\", \"r\"):\n", + " if thisDepth[kind] > maxDepth[kind]:\n", + " maxDepth[kind] = thisDepth[kind]\n", " totalDepth[kind] += thisDepth[kind]\n", " if different:\n", - " if thisDepth[kind] > rMaxDepth[kind]: rMaxDepth[kind] = thisDepth[kind]\n", + " if thisDepth[kind] > rMaxDepth[kind]:\n", + " rMaxDepth[kind] = thisDepth[kind]\n", " rTotalDepth[kind] += thisDepth[kind]\n", - " \n", - "TF.info('{} trees seen, of which in {} cases restructuring makes a difference in depth'.format(\n", - " nTrees, rnTrees,\n", - "))\n", + "\n", + "TF.info(\n", + " \"{} trees seen, of which in {} cases restructuring makes a difference in depth\".format(\n", + " nTrees,\n", + " rnTrees,\n", + " )\n", + ")\n", "if nTrees > 0:\n", - " TF.info('Embedding trees: max depth = {:>2}, average depth = {:.2g}'.format(\n", - " maxDepth['e'], totalDepth['e'] / nTrees,\n", - " ))\n", - " TF.info('Restructd trees: max depth = {:>2}, average depth = {:.2g}'.format(\n", - " maxDepth['r'], totalDepth['r'] / nTrees,\n", - " ))\n", + " TF.info(\n", + " \"Embedding trees: max depth = {:>2}, average depth = {:.2g}\".format(\n", + " maxDepth[\"e\"],\n", + " totalDepth[\"e\"] / nTrees,\n", + " )\n", + " )\n", + " TF.info(\n", + " \"Restructd trees: max depth = {:>2}, average depth = {:.2g}\".format(\n", + " maxDepth[\"r\"],\n", + " totalDepth[\"r\"] / nTrees,\n", + " )\n", + " )\n", "if rnTrees > 0:\n", - " TF.info('Statistics for cases where restructuring makes a difference:')\n", - " TF.info('Embedding trees: max depth = {:>2}, average depth = {:.2g}'.format(\n", - " rMaxDepth['e'], rTotalDepth['e'] / rnTrees,\n", - " ))\n", - " TF.info('Restructd trees: max depth = {:>2}, average depth = {:.2g}'.format(\n", - " rMaxDepth['r'], rTotalDepth['r'] / rnTrees,\n", - " ))\n", - "TF.info('Total number of leaves in the trees: {}, average number of leaves = {:.2g}'.format(\n", - " totalLength, totalLength / nTrees,\n", - "))" + " TF.info(\"Statistics for cases where restructuring makes a difference:\")\n", + " TF.info(\n", + " \"Embedding trees: max depth = {:>2}, average depth = {:.2g}\".format(\n", + " rMaxDepth[\"e\"],\n", + " rTotalDepth[\"e\"] / rnTrees,\n", + " )\n", + " )\n", + " TF.info(\n", + " \"Restructd trees: max depth = {:>2}, average depth = {:.2g}\".format(\n", + " rMaxDepth[\"r\"],\n", + " rTotalDepth[\"r\"] / rnTrees,\n", + " )\n", + " )\n", + "TF.info(\n", + " \"Total number of leaves in the trees: {}, average number of leaves = {:.2g}\".format(\n", + " totalLength,\n", + " totalLength / nTrees,\n", + " )\n", + ")" ] }, { @@ -1194,34 +1269,40 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "## getTag(node)\n", "\n", "This function produces for each node\n", "\n", - "* a tag string, \n", - "* a part-of-speech representation, \n", + "* a tag string,\n", + "* a part-of-speech representation,\n", "* a textual position (slot number),\n", "* a boolean which tells if this node is a leaf or not.\n", "\n", "This function will be passed to the `writeTree()` function in the `tree` module.\n", - "By supplying a different function, you can control a lot of the characteristics of the \n", + "By supplying a different function, you can control a lot of the characteristics of the\n", "written tree." ] }, { "cell_type": "code", "execution_count": 25, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "def getTag(node):\n", " otype = F.otype.v(node)\n", " tag = typeTable[otype]\n", - " if tag == 'P': tag = Fs(ptyp).v(node)\n", - " elif tag == 'C': tag = ccrTable[Fs(rela).v(node)]\n", - " isWord = tag == ''\n", + " if tag == \"P\":\n", + " tag = Fs(ptyp).v(node)\n", + " elif tag == \"C\":\n", + " tag = ccrTable[Fs(rela).v(node)]\n", + " isWord = tag == \"\"\n", " pos = posTable[Fs(sp).v(node)] if isWord else None\n", " slot = node if isWord else None\n", " text = '\"{}\"'.format(Fs(g_word_utf8).v(node)) if isWord else None\n", @@ -1230,7 +1311,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "This is a variant on `getTag()` where we put the node number into the tag, between `{ }`." ] @@ -1244,11 +1327,13 @@ "def getTagN(node):\n", " otype = F.otype.v(node)\n", " tag = typeTable[otype]\n", - " if tag == 'P': tag = Fs(ptyp).v(node)\n", - " elif tag == 'C': tag = ccrTable[Fs(rela).v(node)]\n", - " isWord = tag == ''\n", + " if tag == \"P\":\n", + " tag = Fs(ptyp).v(node)\n", + " elif tag == \"C\":\n", + " tag = ccrTable[Fs(rela).v(node)]\n", + " isWord = tag == \"\"\n", " if not isWord:\n", - " tag += '{' + str(node) + '}'\n", + " tag += \"{\" + str(node) + \"}\"\n", " pos = posTable[Fs(sp).v(node)] if isWord else None\n", " slot = node if isWord else None\n", " text = '\"{}\"'.format(Fs(g_word_utf8).v(node)) if isWord else None\n", @@ -1285,16 +1370,21 @@ } ], "source": [ - "TF.info('Exporting {} trees to TF'.format(rootType))\n", + "TF.info(\"Exporting {} trees to TF\".format(rootType))\n", "s = 0\n", "chunk = 10000\n", "sc = 0\n", "treeData = {}\n", "treeDataN = {}\n", "for node in F.otype.s(rootType):\n", - " if node in skip: continue\n", - " (treeRep, wordsRep, bSlot) = tree.writeTree(node, 'r', getTag, rev=False, leafNumbers=True)\n", - " (treeNRep, wordsNRep, bSlotN) = tree.writeTree(node, 'r', getTagN, rev=False, leafNumbers=True)\n", + " if node in skip:\n", + " continue\n", + " (treeRep, wordsRep, bSlot) = tree.writeTree(\n", + " node, \"r\", getTag, rev=False, leafNumbers=True\n", + " )\n", + " (treeNRep, wordsNRep, bSlotN) = tree.writeTree(\n", + " node, \"r\", getTagN, rev=False, leafNumbers=True\n", + " )\n", " treeData[node] = treeRep\n", " treeDataN[node] = treeNRep\n", " s += 1\n", @@ -1302,7 +1392,7 @@ " if sc == chunk:\n", " TF.info(\"{} trees composed\".format(s))\n", " sc = 0\n", - "TF.info('{} trees composed'.format(s))" + "TF.info(\"{} trees composed\".format(s))" ] }, { @@ -1332,25 +1422,25 @@ "nodeFeatures = dict(tree=treeData, treen=treeDataN)\n", "metaData = dict(\n", " tree=dict(\n", - " valueType='str',\n", - " description='penn treebank represententation for sentences',\n", - " converter='Dirk Roorda', \n", - " convertor='trees.ipynb',\n", - " url='https://github.com/etcbc/trees/trees.ipynb',\n", - " coreData='BHSA',\n", + " valueType=\"str\",\n", + " description=\"penn treebank represententation for sentences\",\n", + " converter=\"Dirk Roorda\",\n", + " convertor=\"trees.ipynb\",\n", + " url=\"https://github.com/etcbc/trees/trees.ipynb\",\n", + " coreData=\"BHSA\",\n", " coreVersion=VERSION,\n", " ),\n", " treen=dict(\n", - " valueType='str',\n", - " description='penn treebank represententation for sentences with node numbers included',\n", - " converter='Dirk Roorda', \n", - " convertor='trees.ipynb',\n", - " url='https://github.com/etcbc/trees/trees.ipynb',\n", - " coreData='BHSA',\n", + " valueType=\"str\",\n", + " description=\"penn treebank represententation for sentences with node numbers included\",\n", + " converter=\"Dirk Roorda\",\n", + " convertor=\"trees.ipynb\",\n", + " url=\"https://github.com/etcbc/trees/trees.ipynb\",\n", + " coreData=\"BHSA\",\n", " coreVersion=VERSION,\n", - " )\n", + " ),\n", ")\n", - "TF.info('Writing tree feature to TF')\n", + "TF.info(\"Writing tree feature to TF\")\n", "TFw = Fabric(locations=TFDIR, silent=True)\n", "TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)" ] @@ -1378,7 +1468,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "See the tutorial \n", + "See the tutorial\n", "[trees](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/bhsa/trees.ipynb)\n", "for how to make use of this feature." ]