Skip to content

Commit

Permalink
[#58] Refactor string flattening and include 1XX $d
Browse files Browse the repository at this point in the history
  • Loading branch information
danmichaelo committed Jun 29, 2018
1 parent 679a9a7 commit f81ce79
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 52 deletions.
102 changes: 53 additions & 49 deletions mc2skos/element.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# encoding=utf8

import re
from functools import reduce
from lxml import etree
import re


class Element(object):
Expand Down Expand Up @@ -35,60 +36,63 @@ def text(self, xpath=None, all=False):
# all: True to return an array with the text content for all matching elements.
# False to return a string with the text content of the first matching element, or None.
# Returns text content of first node or None

def flatten_text(node):
    """Return the text of ``node`` with any child markup stripped away.

    Returns ``node.text`` unchanged (possibly ``None``) when the node has no
    children; otherwise returns a string made of the node's own leading text
    followed by the tail text of every child, in document order.
    """
    # Captions can include Processing Instruction tags, like in this example
    # (linebreaks added):
    #
    # <mx:subfield xmlns:mx="http://www.loc.gov/MARC21/slim"
    # xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" code="t">
    # <?ddc fotag="fo:inline" font-style="italic"?>L
    # <?ddc fotag="fo:inline" vertical-align="super" font-size="70%"?>p
    # <?ddc fotag="/fo:inline"?>
    # <?ddc fotag="/fo:inline"?>-rom
    # </mx:subfield>
    #
    # The code below just strips away the PI tags, giving "Lp-rom" for this example.
    children = node.getchildren()
    if len(children) == 0:
        return node.text

    # Text that precedes the first child lives in node.text, not in any
    # child's tail, so it has to be included explicitly or it is lost.
    parts = [node.text or '']
    parts.extend(child.tail for child in children if child.tail is not None)
    return ''.join(parts)

if xpath is None:
return self.node.text
return flatten_text(self.node)
if all:
return [res.node.text for res in self.all(xpath) if res.node.text is not None]
return [flatten_text(res.node) for res in self.all(xpath) if res.node.text is not None]
for res in self.all(xpath):
return res.node.text # return text of first element
return flatten_text(res.node) # return text of first element

def get_ess_codes(self):
    """Return the codes found in ``$9`` subfields of the form ``ess=<code>``.

    Only ``$9`` text values that start with ``ess=`` are considered; the
    prefix is removed and the remainder is returned, in document order.
    """
    marker = 'ess='
    values = self.node.xpath('mx:subfield[@code="9"]/text()', namespaces=self.nsmap)
    return [value[len(marker):] for value in values if value.startswith(marker)]

def reduce(self, fn, subfields=['a', 'c', 'i', 't', 'x'], initializer=''):
    """Fold ``fn`` over the subfields of this element whose code is in ``subfields``.

    ``fn`` is called as ``fn(accumulated, subfield)`` for each match returned
    by ``self.all``, starting from ``initializer``; the final accumulated
    value is returned. The default ``subfields`` list is only read, never
    modified.
    """
    # Imported under an alias so the fold cannot be confused with this
    # method, which has the same name as functools.reduce.
    from functools import reduce as fold

    predicate = ' or '.join('@code="%s"' % code for code in subfields)
    return fold(fn, self.all('mx:subfield[%s]' % predicate), initializer)

def stringify(self, subfields=['a', 'c', 'i', 't', 'x']):
note = ''
for subfield in self.node.xpath('mx:subfield', namespaces=self.nsmap):
def inner(label, subfield):
code = subfield.get('code')
if code in subfields:

# Captions can include Processing Instruction tags, like in this example
# (linebreaks added):
#
# <mx:subfield xmlns:mx="http://www.loc.gov/MARC21/slim"
# xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" code="t">
# <?ddc fotag="fo:inline" font-style="italic"?>L
# <?ddc fotag="fo:inline" vertical-align="super" font-size="70%"?>p
# <?ddc fotag="/fo:inline"?>
# <?ddc fotag="/fo:inline"?>-rom
# </mx:subfield>
#
# The code below just strips away the PI tags, giving "Lp-rom" for this example.

children = subfield.getchildren()
if len(children) != 0:
txt = ''
for child in children:
if child.tail is not None:
txt += child.tail
else:
txt = subfield.text

if txt is None:
continue

# Check if we need to add a delimiter
if code == 'c':
# Treat $c as the end of a number span, which is correct for the 6XX fields
# in MARC21 Classification. In Marc21 Authority, $c generally seems to be
# undefined, but we might add some checks here if there are some $c subfields
# that need to be treated differently.
note += '-'

elif len(note) != 0 and not re.match(r'[.\?#@+,<>%~`!$^&\(\):;\]]', txt[0]):
# Unless the subfield starts with a punctuation character, we will add a space.
note += ' '

# Append the subfield text to the note
note += txt

return note
value = subfield.text()
if value is None:
return label

# Check if we need to add a separator
if code == 'c':
# Treat $c as the end of a number span, which is correct for the 6XX fields
# in MARC21 Classification. In Marc21 Authority, $c generally seems to be
# undefined, but we might add some checks here if there are some $c subfields
# that need to be treated differently.
value = '-' + value

elif len(label) != 0 and not re.match(r'[.\?#@+,<>%~`!$^&\(\):;\]]', value[0]):
# Unless the subfield starts with a punctuation character, we will add a space.
value = ' ' + value

return label + value

return self.reduce(inner, subfields)
23 changes: 20 additions & 3 deletions mc2skos/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,25 @@ def get_terms(self, base='1'):
# X62 - Medium of Performance Term
tags = ['@tag="%s%s"' % (base, tag) for tag in ['00', '10', '11', '30', '47', '48', '50', '51', '53', '55', '62']]
for entry in self.record.all('mx:datafield[%s]' % ' or '.join(tags)):
codes = ['@code="%s"' % code for code in ['a', 'x', 'y', 'z', 'v']]
term_parts = entry.text('mx:subfield[%s]' % ' or '.join(codes), True)

def reducer(value, element):
    """Append one subfield to the label built so far and return the result.

    ``value`` is the label accumulated from earlier subfields and ``element``
    is the next subfield. The separator depends on the subfield code:

    - first part of the label: no separator
    - ``$d``: wrapped in parentheses, unless the label already ends with
      ``,`` or ``;``, in which case it is joined with a single space
    - ``$x``, ``$y``, ``$z``, ``$v``: joined with ``--``
    - anything else: joined with a single space

    A subfield without text leaves the label unchanged.
    """
    text = element.text()
    if text is None:
        # Nothing to append; concatenating None would raise TypeError.
        return value

    code = element.get('code')
    prefix = ' '
    suffix = ''

    if value == '':
        prefix = ''
    elif code == 'd' and value[-1] not in [',', ';']:
        prefix = ' ('
        suffix = ')'
    elif code in ['x', 'y', 'z', 'v']:
        prefix = '--'

    return value + prefix + text + suffix

label = entry.reduce(reducer, ['a', 'd', 'x', 'y', 'z', 'v'])

# codes = ['@code="%s"' % code for code in ['a', 'd', 'x', 'y', 'z', 'v']]
# term_parts = entry.text('mx:subfield[%s]' % ' or '.join(codes), True)
cn = entry.text('mx:subfield[@code="0"]')
cni = None
if cn is not None:
Expand All @@ -95,7 +112,7 @@ def get_terms(self, base='1'):
else:
cn = cn[0]
yield {
'value': '--'.join(term_parts),
'value': label,
'node': entry,
'control_number': cn,
'control_number_identifier': cni,
Expand Down

0 comments on commit f81ce79

Please sign in to comment.