[#58] Refactor string flattening and include 1XX $d

scriptotek · Jun 29, 2018 · f81ce79 · f81ce79
1 parent 679a9a7
commit f81ce79
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 52 deletions.
diff --git a/mc2skos/element.py b/mc2skos/element.py
@@ -1,7 +1,8 @@
 # encoding=utf8
 
-import re
+from functools import reduce
 from lxml import etree
+import re
 
 
 class Element(object):
@@ -35,60 +36,63 @@ def text(self, xpath=None, all=False):
         # all: True to return an array with the text content for all matching elements.
         #      False to return a string with the text content of the first matching element, or None.
         # Returns text content of first node or None
+
+        def flatten_text(node):
+            # Return the text of `node` with any child nodes stripped away.
+            #
+            # Captions can include Processing Instruction tags, like in this example
+            # (linebreaks added):
+            #
+            #   <mx:subfield xmlns:mx="http://www.loc.gov/MARC21/slim"
+            #                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" code="t">
+            #     <?ddc fotag="fo:inline" font-style="italic"?>L
+            #       <?ddc fotag="fo:inline" vertical-align="super" font-size="70%"?>p
+            #       <?ddc fotag="/fo:inline"?>
+            #     <?ddc fotag="/fo:inline"?>-rom
+            #   </mx:subfield>
+            #
+            # The PI tags are dropped and the text around them is joined, giving
+            # "Lp-rom" for this example. Text preceding the first child (node.text)
+            # is kept as well. Returns None only for a childless node without text.
+            children = list(node)  # getchildren() is deprecated in lxml
+            if not children:
+                return node.text
+            tails = [child.tail for child in children if child.tail is not None]
+            return (node.text or '') + ''.join(tails)
+
         if xpath is None:
-            return self.node.text
+            return flatten_text(self.node)
         if all:
-            return [res.node.text for res in self.all(xpath) if res.node.text is not None]
+            return [flatten_text(res.node) for res in self.all(xpath) if res.node.text is not None]
         for res in self.all(xpath):
-            return res.node.text  # return text of first element
+            return flatten_text(res.node)  # return text of first element
 
     def get_ess_codes(self):
         return [x[4:] for x in self.node.xpath('mx:subfield[@code="9"]/text()', namespaces=self.nsmap) if x.find('ess=') == 0]
 
+    def reduce(self, fn, subfields=['a', 'c', 'i', 't', 'x'], initializer=''):
+        selector = ' or '.join('@code="%s"' % code for code in subfields)  # XPath predicate
+        return reduce(fn, self.all('mx:subfield[%s]' % selector), initializer)
+
     def stringify(self, subfields=['a', 'c', 'i', 't', 'x']):
-        note = ''
-        for subfield in self.node.xpath('mx:subfield', namespaces=self.nsmap):
+        def inner(label, subfield):
             code = subfield.get('code')
-            if code in subfields:
-
-                # Captions can include Processing Instruction tags, like in this example
-                # (linebreaks added):
-                #
-                #   <mx:subfield xmlns:mx="http://www.loc.gov/MARC21/slim"
-                #                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" code="t">
-                #     <?ddc fotag="fo:inline" font-style="italic"?>L
-                #       <?ddc fotag="fo:inline" vertical-align="super" font-size="70%"?>p
-                #       <?ddc fotag="/fo:inline"?>
-                #     <?ddc fotag="/fo:inline"?>-rom
-                #   </mx:subfield>
-                #
-                # The code below just strips away the PI tags, giving "Lp-rom" for this example.
-
-                children = subfield.getchildren()
-                if len(children) != 0:
-                    txt = ''
-                    for child in children:
-                        if child.tail is not None:
-                            txt += child.tail
-                else:
-                    txt = subfield.text
-
-                if txt is None:
-                    continue
-
-                # Check if we need to add a delimiter
-                if code == 'c':
-                    # Treat $c as the end of a number span, which is correct for the 6XX fields
-                    # in MARC21 Classification. In Marc21 Authority, $c generally seems to be
-                    # undefined, but we might add some checks here if there are some $c subfields
-                    # that need to be treated differently.
-                    note += '-'
-
-                elif len(note) != 0 and not re.match(r'[.\?#@+,<>%~`!$^&\(\):;\]]', txt[0]):
-                    # Unless the subfield starts with a punctuation character, we will add a space.
-                    note += ' '
-
-                # Append the subfield text to the note
-                note += txt
-
-        return note
+            value = subfield.text()
+            if not value:
+                # No text to append (None or ''); this also guards value[0] below.
+                return label
+            # Check if we need to add a separator
+            if code == 'c':
+                # Treat $c as the end of a number span, which is correct for the 6XX fields
+                # in MARC21 Classification. In Marc21 Authority, $c generally seems to be
+                # undefined, but we might add some checks here if there are some $c subfields
+                # that need to be treated differently.
+                value = '-' + value
+
+            elif len(label) != 0 and not re.match(r'[.\?#@+,<>%~`!$^&\(\):;\]]', value[0]):
+                # Unless the subfield starts with a punctuation character, we will add a space.
+                value = ' ' + value
+
+            return label + value
+
+        return self.reduce(inner, subfields)
diff --git a/mc2skos/record.py b/mc2skos/record.py
@@ -83,8 +83,25 @@ def get_terms(self, base='1'):
         # X62 - Medium of Performance Term
         tags = ['@tag="%s%s"' % (base, tag) for tag in ['00', '10', '11', '30', '47', '48', '50', '51', '53', '55', '62']]
         for entry in self.record.all('mx:datafield[%s]' % ' or '.join(tags)):
-            codes = ['@code="%s"' % code for code in ['a', 'x', 'y', 'z', 'v']]
-            term_parts = entry.text('mx:subfield[%s]' % ' or '.join(codes), True)
+
+            def reducer(value, element):
+                # Append one subfield to the label built so far; the separator depends on its code.
+                text = element.text()
+                if text is None:
+                    return value  # subfield without text: nothing to append
+                code = element.get('code')
+                if value == '':
+                    return text  # first part of the label takes no separator
+                if code == 'd' and value[-1] not in [',', ';']:
+                    return value + ' (' + text + ')'
+                if code in ['x', 'y', 'z', 'v']:
+                    return value + '--' + text
+                return value + ' ' + text
+
+            label = entry.reduce(reducer, ['a', 'd', 'x', 'y', 'z', 'v'])
+
+            # codes = ['@code="%s"' % code for code in ['a', 'd', 'x', 'y', 'z', 'v']]
+            # term_parts = entry.text('mx:subfield[%s]' % ' or '.join(codes), True)
             cn = entry.text('mx:subfield[@code="0"]')
             cni = None
             if cn is not None:
@@ -95,7 +112,7 @@ def get_terms(self, base='1'):
                 else:
                     cn = cn[0]
             yield {
-                'value': '--'.join(term_parts),
+                'value': label,
                 'node': entry,
                 'control_number': cn,
                 'control_number_identifier': cni,