Commit fda8053 (0 parents): 30 changed files with 9,458 additions and 0 deletions.
from __future__ import division # Corporate math still sucks
from util import dct
from util.fnc import cur, compose, pipe, negate, iseq
from util.reflect import postmortem
from util.lst import concat, avg, fst, snd, car, cdr
from itertools import imap
import lev
from unifeat import unify
from operator import sub, or_, and_
fs = ['../phonology/dialect/utp02datanew.txt',
      '../phonology/dialect/see26datanew.txt',
      '../phonology/dialect/sgb20datanew.txt',
      '../phonology/dialect/sgj20datanew.txt',
      '../phonology/dialect/sgl20datanew.txt',
      '../phonology/dialect/sif20datanew.txt',
      '../phonology/dialect/siw20datanew.txt',
      '../phonology/dialect/siz20datanew.txt',
      '../phonology/dialect/siy20datanew.txt',
      '../phonology/dialect/smd20datanew.txt',]
def read_unicode(f):
    "filename->[[utf-8-char]]"
    return map(lambda u: map(lambda s: s.encode('utf8'), u),
               file(f).read().decode('utf16').split(u'\n'))
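# read_unicode decodes the UTF-16 data file and yields one list per line,
# each element a single character re-encoded as a UTF-8 byte string; e.g. a
# (hypothetical) line u'ab\u0259' becomes ['a', 'b', '\xc9\x99'].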
def self_sub(change):
    "lev.Rule -> bool -- Is this a boring self-substitution?"
    return change.type==lev.SUB and change.src==change.dst
class Hash():
    "Box with proxied __eq__ and __hash__ to allow custom hashing (dict & set)"
    def __init__(self, eq, hash, x):
        lev.init_attrs(self, locals())
    def __str__(self):
        return 'Hash(%s, eq=%s, hash=%s)' % (self.x, self.eq, self.hash)
    def __repr__(self):
        return 'Hash(eq=%r, hash=%r, x=%r)' % (self.eq, self.hash, self.x)
    def __hash__(self):
        return self.hash(self.x)
    def __eq__(self, other):
        return self.eq(self.x, other.x)
    def get(self):
        return self.x
def cmpset(l, eq, hash):
    return set(hx.get() for hx in set(Hash(eq, hash, x) for x in l))
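# Usage sketch (hypothetical values): deduplicating case-insensitively; one
# representative of each equivalence class survives (which one is unspecified):
#   >>> cmpset(['a', 'A', 'b'],
#   ...        eq=lambda x, y: x.lower() == y.lower(),
#   ...        hash=lambda x: hash(x.lower()))
#   set(['a', 'b'])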
def collapse_envs(rules):
    "[lev.Rule] -> set<lev.Rule>"
    return cmpset(rules, lev.Rule.eq_env, lev.Rule.hash_env)
def classify(row):
    "[[lev.Rule]] -> {utf-8-char:set<lev.Rule>}"
    return dct.map(set, #collapse_envs,
                   dct.collapse(filter(negate(self_sub), concat(row)),
                                keymap=lambda rule:rule.src))
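# classify flattens the per-word rule lists, drops boring self-substitutions,
# and groups what is left by source character, so the result maps each UTF-8
# character to the set of rules that rewrite it (shape illustrative only):
#   {'t': set([...rules with src 't'...]), '\xc9\x99': set([...])}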
def compare(l1, l2):
    "str*str -> [[lev.Rule]]"
    lang1 = read_unicode(l1)
    lang2 = read_unicode(l2)
    dist = lev.totalavgdistance(map(unify, lang1), map(unify, lang2))
    return map(lambda s1,s2:(lev.enviro(s2,s1,dist) if s2 else []),
               lang1, lang2)
def run_compare_to_base(fs):
    "[str] -> [{utf-8-char:set<lev.Rule>}]"
    return map(pipe(cur(compare, fs[0]), classify), fs)
def run_compare_all_to_sgbsiy(fs):
    """[str] -> {utf-8-char:set<lev.Rule>}
    (siy<=>sgb) - (map (base<=>) rest)"""
    sgb = fs[2]
    siy = fs[8]
    base = fs[0]
    del fs[8]; del fs[2]; del fs[0] # dangerous but who cares
    diff = classify(compare(sgb, siy))
    others = map(compose(classify, cur(compare, base)), fs)
    # return dct_mapall(lambda v,*rest: reduce(sub, rest, v), diff, *others)
    kws = {'default':set()}
    return dct.zipwith((lambda v,*rest: reduce(sub, rest, v)), diff, *others, **kws)
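# dct.zipwith comes from the local util package; judging from its use here it
# is assumed to combine several dicts key-wise, filling gaps with `default`.
# A minimal sketch under that assumption:
#   def zipwith(f, *dicts, **kws):
#       default = kws.get('default')
#       keys = reduce(or_, (set(d) for d in dicts), set())
#       return dict((k, f(*[d.get(k, default) for d in dicts])) for k in keys)
# So the lambda above subtracts every other dialect's rule set from diff's,
# key by key.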
def run_compare_sgbsiy_to_base(fs):
    """[str] -> {utf-8-char:set<lev.Rule>}
    ((sgb <=> base) | (siy <=> base)) - (map (<=> base) rest)"""
    sgb = fs[2]
    siy = fs[8]
    base = fs[0]
    del fs[8]; del fs[2]; del fs[0] # dangerous but who cares
    outsiders = dct.zipwith(or_,
                            classify(compare(base, sgb)),
                            classify(compare(base, siy)),
                            default=set())
    others = map(compose(classify, cur(compare, base)), fs)
    kws = {'default':set()}
    return dct.zipwith((lambda v,*rest: reduce(sub, rest, v)), outsiders, *others, **kws)
def run_compare_sgb_and_siy_to_base(fs):
    """[str] -> {utf-8-char:set<lev.Rule>}
    ((sgb <=> base) & (siy <=> base)) - (map (<=> base) rest)"""
    sgb = fs[2]
    siy = fs[8]
    base = fs[0]
    del fs[8]; del fs[2]; del fs[0] # dangerous but who cares
    outsiders = dct.zipwith(and_,
                            classify(compare(base, sgb)),
                            classify(compare(base, siy)),
                            default=set())
    others = map(compose(classify, cur(compare, base)), fs)
    kws = {'default':set()}
    return dct.zipwith((lambda v,*rest: reduce(sub, rest, v)), outsiders, *others, **kws)
def run_compare_shared_sgbsiy(fs):
    """this really needs a lenient definition of eq?
    (sgb <=> base) & (siy <=> base)"""
    sgb = fs[2]
    siy = fs[8]
    base = fs[0]
    del fs[8]; del fs[2]; del fs[0] # dangerous but who cares
    return dct.zipwith(and_,
                       classify(compare(base, sgb)),
                       classify(compare(base, siy)),
                       default=set())
# defs rather than lambdas, so attr.func_name in the main loop below yields
# 'getsrc'/'getdst'/'getpair' instead of '<lambda>' three times
def getsrc(rule): return rule.src
def getdst(rule): return rule.dst
def getpair(rule): return (rule.dst, rule.src)
def run_collapse_differences(fs, get=getdst):
    base = fs[0]
    del fs[0]
    subs = [[get(rule) for rule in concat(compare(base,f))
             if rule.type==lev.SUB and rule.dst!=rule.src]
            for f in fs]
    return dct.zip(dct.count(concat(subs)), default=0, *map(dct.count, subs))
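# dct.count is assumed to tally occurrences into {item: frequency}, and
# dct.zip to align several dicts into {key: [v0, v1, ...]}, padding missing
# keys with `default`; each char therefore maps to a count list
# [total, see26, sgb20, sgj20, sgl20, sif20, siw20, siz20, siy20, smd20]
# (the base file utp02 having been deleted from fs above).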
def lst_except(l, *ns):
    """Totally inefficient! You have been warned, dude!
    (requiring ns to be ordered could help a lot if I actually cared)"""
    acc = []
    for i,x in enumerate(l):
        if i not in ns:
            acc.append(x)
    return acc
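# For example:
#   >>> lst_except(['a', 'b', 'c', 'd'], 0, 2)
#   ['b', 'd']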
def find_collapsed(f, collapsed):
    "([int] -> num) * {char:[int]} -> [(char,num)] (sorted by score, descending)"
    return sorted(dct.map(f, collapsed).items(), key=snd, reverse=True)
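# In the count lists produced above, index 0 is the overall total; after the
# base file is dropped, indices 2 and 8 line up with sgb20 and siy20, so diff
# (below) scores how much more often a change occurs in those two dialects
# than in the rest.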
diff = lambda freqs: avg([freqs[2],freqs[8]]) - avg(lst_except(freqs,0,2,8))
def variance(freqs):
    average = avg(cdr(freqs))
    return sum((average - c)**2 for c in cdr(freqs)) / average
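# Note that the squared deviations are scaled by the mean rather than by n,
# so this is closer to an index of dispersion than a textbook variance.
# Worked example: freqs = [12, 2, 4, 6] drops the total (12), giving
# average = 4 and ((4-2)**2 + 0 + (4-6)**2) / 4 = 2.0.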
find_difference = cur(find_collapsed, diff)
find_variance = cur(find_collapsed, variance)
def to_html_group_differences(f, name, differences):
    print >>f, "<h1>%s</h1>" % name
    print >>f, "<table border=1 cellspacing=0 bordercolor='black'><tr><td></td><th>Char</th><th>Variance</th></tr>"
    for i, (sub,variance) in enumerate(differences):
        if isinstance(sub, tuple):
            s = "<tr><td>%s</td><td>%s → %s</td><td>%s</td></tr>"
            row = i, sub[1], sub[0], variance
        else:
            s = "<tr><td>%s</td><td>%s</td><td>%s</td></tr>"
            row = i, sub, variance
        print >>f, s % row
    print >>f, "</table>"
def to_html_variances(f, name, variances):
    print >>f, "<h1>%s</h1>" % name
    print >>f, "<table border=1 cellspacing=0 bordercolor='black'><tr><th>Char</th><th>Variance</th></tr>"
    for pair in variances:
        print >>f, "<tr><td>%s</td><td>%s</td></tr>" % pair
    print >>f, "</table>"
def to_html_differences(f, name, combined):
    "file*str*{char:[int]}"
    print >>f, '''<h1>%s</h1>''' % name
    print >>f, "<table border=1 cellspacing=0 bordercolor='black'><tr><th>Char</th><th>All</th>",
    print >>f, ''.join('<th>%s</th>' % fn[21:24] for fn in fs[1:]), "<th>Avg</th></tr>"
    for char,counts in combined.items():
        print >>f, "<tr><td>%s</td>" % char,
        print >>f, ''.join("<td>%s</td>" % c for c in counts),
        print >>f, "<td>%.2f</td></tr>" % avg(counts[1:])
    print >>f, "</table>"
def to_html(f, name, row):
    print >>f, '''<h1>%s</h1>''' % name
    for char,changes in row.items():
        print >>f, '<h2>%s</h2><p>' % char
        for change in changes:
            print >>f, '%s<br/>' % change.to_html()
        print >>f, '</p>'
if __name__=="__main__":
    setup = ((run_compare_sgb_and_siy_to_base,
              'rule/smartenv',
              '((sgb <=> base) & (siy <=> base)) - (map (<=> base) rest), eq?-rule/smartenv',
              'sgb_and_siy_to_base'),
             (run_compare_sgb_and_siy_to_base,
              'rule',
              '((sgb <=> base) & (siy <=> base)) - (map (<=> base) rest), eq?-rule',
              'sgb_and_siy_to_base-simple'),
             (run_compare_sgbsiy_to_base,
              'rule',
              '((sgb <=> base) | (siy <=> base)) - (map (<=> base) rest), eq?-rule',
              'sgbsiy_to_base-simple'),
             (run_compare_shared_sgbsiy,
              'all',
              '(sgb <=> base) & (siy <=> base), eq?-all',
              'shared_sgbsiy-full'),
             (run_compare_shared_sgbsiy,
              'rule',
              '(sgb <=> base) & (siy <=> base), eq?-rule',
              'shared_sgbsiy-simple'),
             (run_compare_shared_sgbsiy,
              'rule/smartenv',
              '(sgb <=> base) & (siy <=> base), eq?-rule/smartenv',
              'shared_sgbsiy')
             )
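    # the reassignment below overrides the table above (apparently on
    # purpose), leaving only the difference-counting run active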
    setup = ((run_collapse_differences,
              'xxx',
              'Counting differences',
              'count_differences'),)
    for run,rule,title,fname in setup:
        #f = open('align_'+fname+'-revised.html', 'w')
        f = open(fname+'.html', 'w')
        print >>f, '''<html><head>
        <meta http-equiv="content-type" content="text/html; charset=utf-8">
        <title>Observed changes from baseline English</title></head><body>'''
        lev.setRuleCompare(rule)
        #to_html(f, title, run(list(fs)))
        for attr in (getsrc,getdst,getpair):
            to_html_group_differences(f,
                                      "%s – %s" % (title,attr.func_name),
                                      find_difference(run(list(fs), attr)))
##            to_html_differences(f,
##                                "%s – %s" % (title,attr.func_name),
##                                run(list(fs), attr))
##        map(to_html,
##            ('utp02', 'see26', 'sgb20', 'sgj20', 'sgl20', 'sif20','siw20','siz20'),
##            run(fs))
        print >>f, '</body></html>'
        f.close()
# number of things that got thrown out because they were shared
{'': 18,
 '\xc9\x99': 3,
 '\xc9\x9b': 1,
 'b': 7,
 'e': 1,
 'd': 2,
 '\xc9\x91': 6,
 'k': 8,
 'j': 1,
 '\xca\xb0': 6,
 '\xc9\x94': 3,
 'o': 3,
 'n': 2,
 'p': 1,
 's': 3,
 '\xc9\xaa': 4,
 't': 7,
 '\xca\x8a': 7,
 'v': 0,
 'w': 2,
 '\xca\x83': 0}
{'': 31,
 '\xc9\x99': 4,
 '\xc9\x9b': 2,
 'b': 8,
 'e': 1,
 'd': 2,
 '\xc9\x91': 7,
 'k': 8,
 'j': 2,
 '\xca\xb0': 8,
 '\xc9\x94': 4,
 'o': 4,
 'n': 3,
 'p': 5,
 's': 6,
 '\xc9\xaa': 10,
 't': 9,
 '\xca\x8a': 7,
 'v': 1,
 'w': 2,
 '\xca\x83': 1}
# changes that weren't shared by even one of the others
# (so these should actually be calculated at some point too)
set(['\xc9\x9c', '\xc9\x92', '\xc9\xbe', '\xc3\xa7', '\xc3\xa6', '\xc9\xa8', '\xc9\xab', '\xc9\xac', '\xc9\xaf', '\xc3\xb0', '\xca\x94', '\xce\xb8', '\xca\x8f', '\xca\x89', '\xca\x82', 'a', '\xca\x8c', 'g', 'f', 'i', 'h', 'm', 'l', 'r', 'z'])
# segments with [dorsal] cause a violation (in the OT paper) (this def is a
# little weak)
# try lining up everything logically for the HTML dump
# clustering is really clustering pathologies, and then I would like to extract
# the details so that treatment can be prescribed once particular patterns of
# deafness are identified and categorised
# worked on Tuesday 1.5 h looking for this paper.
# Wednesday 2 h + 8.25-22
# Thursday: 8:30 - 11:45
# Friday: 9:15 - 9:45
# try to find 'guidelines for constraints' paper again
# I uh, can't find this, but here are some cool papers on the ROA (abstract only)
# 909: Boersma and Hamann show that the 'prototype effect' can be derived
#      by OT simulations that optimise their grammars.
# 484: Jonas Kuhn's thesis on computational OT syntax (OT-LFG)
# 895: An alternative to iterative footing (might be relevant, the abstract
#      seems a little confused)
# 888: Proves that the computational complexity of stochastic OT learning
#      algorithms is k-1
# 883: Tessier's BCD dissertation
# 878 (823) (844.8): McCarthy's OT-CC 'Slouching toward optimality'
# 873: Anttila's T-orders
# 872: Pater et al: Harmonic Grammars translate into linear systems. I think
#      these grammars are supersets of OT grammars. They have code available.
# 863/864: Andries Coetzee: I think this is the weird non-OT talk he gave at
#      the Phonology Fest weekend. (His dissertation is at 687)
# 858: Hayes and Wilson's Maximum Entropy Learning
# 851: Oostendorp argues against Port's incomplete neutralisation
# 835: On-line learning of underlying forms. 10 pages! But it's magic!
# 818: Use a freakin' machine to do OT! Also, Finnish is hard.
# 811: Learn underlying forms by restricting search to a lexical subspace. (short too)
# 798: Prince turns OT back into Harmony Theory via 'utility functions'??
# 844.12: Tesar talks about learning paradigms
# 794: Hey! It's those FRed people! But with a paper instead of bad Ruby.
# 780: Pater shows how to handle variation with an RCD-family algorithm
# 746: Apoussidou and Boersma compare GLA and EDCD for learning stress.
#      GLA is better. Surprise!
# 739: Pater modifies BCD to learn Stratal OT (?) grammars.
# 695: Tesar creates Contrast Analysis for learning (see 811)
# 688: Generating 'contenders' from an infinite list of candidates. FSTs+RCD
# 683: McCarthy shows how to learn faithful /B/->[B] mappings after having
#      learnt the harder /A/->[B] one.
# 672..675: Keller and Asudeh: GLA sucks! (although RCD does too)
# 638: Boersma reviews Tesar & Smolensky 2000 and says that learnability means
#      that not all factorial typologies are possible??!
# 625: Jaeger compares Stochastic OT with Boersma's MaxEnt model and shows
#      that you can get GLA to work with Maximum Entropy too, and you get
#      guaranteed convergence
# 620: Tesar and Prince use phonotactics (?) to learn phono. alternations
# 618/619: Tesar, Prince et al add inconsistency detection to BCD, speeding it up
# 610: A U Mass thesis on syncope
# 600: Some constraints generate violations quadratic in the length of the word,
#      like Align(Foot, Word), so you can prove that OT phonology is not regular.
# 592: Catalan may have similar syllabification to Mongolian in its clitics
# 562: Prince explains comparative tableaux (I think I have this already)
# 544: Jaeger proposes Bidirectional GLA (sets up a speaker/hearer loop?)
# 537: Prince and Smolensky's original OT manuscript, revised slightly
# 536: Prince explores alternative architectures more similar to Harmony Theory
#      from the 80s. And sees what happens.
# 500: Entailed Ranking Arguments: Prince formalises what a machine needs to
#      know to do OT. (I think I have this already)
# 463: Somebody wrote a constraint runner in 2001. As usual, works on stress.
# 459: More candidates than atoms in the universe: somebody bad at math debunks
#      OT again. (n m) not n!m! maybe...
# 446: Broselow writes about stress-epenthesis interactions.
#      (I may have this already)
# 426: (Tesar introduces inconsistency detection)
# 418: Lombardi explains why L2 English speakers use either [s] or [t] based on L1
# 400: Minimal constraint demotion in (human) acquisition of German
# 392: Argument that pure Lexicon Optimisation is too restrictive
# 390: Michael Hammond does some more logic<=>OT isomorphisms