diff --git a/ACL07_poster_equations.tex b/ACL07_poster_equations.tex
new file mode 100644
index 0000000..d97c713
--- /dev/null
+++ b/ACL07_poster_equations.tex
@@ -0,0 +1,25 @@
+\documentclass[11pt]{article}
+\usepackage[vcentering]{geometry} % dvips not needed for pdflatex
+% \usepackage[vcentering,dvips]{geometry}
+% % geometry for the f = c/N equation
+% \geometry{papersize={20mm,15mm},total={30mm,24mm}}
+% % geometry for the c' equation
+% \geometry{papersize={50mm,18mm},total={65mm,26mm}}
+% % geometry for the s equation
+% \geometry{papersize={40mm,18mm},total={52mm,26mm}}
+% geometry for the R equation
+\geometry{papersize={68mm,18mm},total={85mm,26mm}}
+\begin{document}
+\thispagestyle{empty}
+% \[f = \frac{c}{N}\]
+
+% \[\forall j \in a,b : c'_{ji} = \frac{f_{ji}(c_{ai}+c_{bi})}{f_{ai}+f_{bi}}\]
+
+% \[\forall j \in a,b : s_{ji} = \frac{2nc_{ji}'}{N}\]
+
+\[ R = \sum_i |c_{ai} - \bar{c_i}| \textrm{ where } \bar{c_i} = \frac{c_{ai} + c_{bi}}{2}\]
+\end{document}
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
diff --git a/Lev.hs b/Lev.hs
new file mode 100644
index 0000000..c8c1044
--- /dev/null
+++ b/Lev.hs
@@ -0,0 +1,22 @@
+module Lev where
+import Control.Monad (liftM)
+enum :: [a] -> [(Int, a)]
+enum = zip [0..]
+-- take n elements, or Nothing if the list is too short
+takeFail :: Int -> [a] -> Maybe [a]
+takeFail 0 _ = Just []
+takeFail _ [] = Nothing
+takeFail n (x:xs) = liftM (x:) (takeFail (n - 1) xs)
+-- all contiguous windows of length n
+window :: Int -> [a] -> [[a]]
+window n l = case takeFail n l of
+  Nothing -> []
+  Just xs -> xs : window n (tail l)
+-- Levenshtein distance with parameterised insertion, deletion and
+-- substitution costs. Returns the last row of the dynamic-programming
+-- table in reverse order, so the head of the result is the edit distance.
+_levenshtein :: (Enum b, Num b, Ord b) =>
+                [t] -> [a] -> b -> (a -> b, t -> b, t -> a -> b) -> [b]
+_levenshtein ss ts indel (ins,del,sub) =
+  let initial l = [0,indel..indel * fromIntegral (length l)]
+  in foldl (\ table (i,s) ->
+       foldl (\ row (t,[prev,prev']) ->
+                minimum [ins t + head row, del s + prev', sub s t + prev]
+                : row)
+             [i]
+             -- rows are consed together back to front, so reverse the
+             -- previous row before pairing its adjacent cells with ts
+             (zip ts . window 2 . reverse $ table))
+     (reverse (initial ts)) (zip (tail (initial ss)) ss)
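+-- A usage sketch (unit-cost string edit distance; illustration only,
+-- not used elsewhere in this repo):
+--   head (_levenshtein "kitten" "sitting" 1
+--           (const 1, const 1, \s t -> if s == t then 0 else 1))
+--   ==> 3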
diff --git a/Main.hs b/Main.hs
new file mode 100644
index 0000000..ff4c9c5
--- /dev/null
+++ b/Main.hs
@@ -0,0 +1,4 @@
+module Main where
+import qualified Sed
+main :: IO ()
+main = print . Sed.analyse =<< Sed.groupSedInGor
diff --git a/Sed.hs b/Sed.hs
new file mode 100644
index 0000000..f491f7d
--- /dev/null
+++ b/Sed.hs
@@ -0,0 +1,90 @@
+module Sed where
+import Text.Regex.Posix
+import Text.CSV
+import Data.Char (isLower)
+import qualified Data.Map as Dct
+import Data.List
+import Data.Ord
+import qualified Lev
+--- util ---
+(&) = flip (.)
+dctCollapse xs k v = Dct.fromAscListWith
+ (++)
+ (sortBy (comparing fst) [(k x, [v x]) | x <- xs])
+listExtract ns xs = extract ns xs 0
+  where extract [] _ _ = []
+        extract (n:ns) xs i =
+          xs !! (n - i) : extract ns (drop (n - i) xs) n
+kross (xs,ys) = do x <- xs; y <- ys; return (x,y)
+both f (x,y) = (f x, f y)
+pairs [] = []
+pairs (x:xs) = [(x,y) | y <- xs] ++ pairs xs
+average l = sum l / fromIntegral (length l)
+--- read CSV ---
+segment = head & dropWhile isLower & segmentName
+  where segmentName s = seg ++ n
+          where (seg,n,_) = s =~ "[0-9]" :: (String,String,String)
+features (title:ns) = (feature title, map read ns)
+  where feature s = feat
+          where (_,_,feat) = s =~ "[0-9]" :: (String,String,String)
+groupWords csv = Dct.map (fillsegments . phones) words
+  where words = dctCollapse (tail csv) (head & takeWhile isLower) id
+        fillsegments = Dct.mapWithKey makesegment
+        phones l = Dct.map Dct.fromList (dctCollapse l segment features)
+makesegment typ d =
+ let size = length . head . Dct.elems $ d
+ in d `Dct.union` Dct.map (replicate size) (featdict Dct.! init typ)
+groupRegions regions words = Dct.map outermost regions
+  where outermost range = Dct.map inner words
+          where inner = Dct.map (Dct.map (listExtract (map (subtract 2) range)))
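+-- Shapes, mirroring the checks in test_sed.py: groupWords gives
+--   word -> segment -> feature -> [one reading per site column],
+-- and groupRegions wraps that in an outer region key, keeping only the
+-- columns whose rows (offset by 2 for the title row) fall in the region.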
+groupSedInGor = do
+  csv <- parseCSVFromFile "sed.csv"
+  case csv of
+    Left err -> error ("oh no:" ++ show err)
+    Right rows -> return $ groupRegions regions $ groupWords $ transpose rows
+--- analysis ---
+flatten = map (map Dct.elems . Dct.elems) . Dct.elems
+analyse sed = Dct.fromList . zip edges . map (sedDistance avgregions) $ regions
+  where edges = pairs (Dct.keys sed)
+        regions = pairs (flatten sed)
+        avgregions = average (map sedAvgTotal regions)
+featureSub seg1 seg2 = fromIntegral (Dct.size (seg1 `symmetric_difference` seg2))
+                       + sum (map abs (Dct.elems (Dct.intersectionWith (-) seg1 seg2)))
+  where symmetric_difference d e = Dct.union (e `Dct.difference` d)
+                                             (d `Dct.difference` e)
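+-- e.g. featureSub (Dct.fromList [("a",1.0)]) (Dct.fromList [("b",1.0)]) == 2.0
+--      (two unshared features), and
+--      featureSub (Dct.fromList [("a",1.0)]) (Dct.fromList [("a",0.5)]) == 0.5
+--      (shared feature, absolute difference); cf. testFeatureSub in test_sed.py.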
+sedDistance avg = sum . map (sedLevenshtein avg) . uncurry zip
+transposeWord word = transpose (map transposeSegment word)
+  where transposeSegment seg = map (Dct.fromList . zip (Dct.keys seg))
+                                   (transpose (Dct.elems seg))
+sedLevenshtein a = average . map (levenshtein a) . kross . both transposeWord
+levenshtein a (w1,w2) =
+ head $ Lev._levenshtein w1 w2 a (const a, const a, featureSub)
+sedAvg :: (Ord k, Fractional a) => ([Dct.Map k [a]], [Dct.Map k [a]]) -> a
+sedAvg = both (concat . transposeWord) & kross & map (uncurry featureSub) & average
+sedAvgTotal (region1,region2) = average (map sedAvg (zip region1 region2)) / 2.0
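+-- analyse compares every pair of regions: sedDistance sums sedLevenshtein
+-- over the zipped word lists, and sedLevenshtein averages Lev._levenshtein
+-- over the cross product of the two words' per-informant segment sequences,
+-- with featureSub as the substitution cost and the corpus-wide average
+-- segment difference (avgregions) as the insert/delete cost.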
+--- data ---
+featdict = Dct.fromList [("C", Dct.fromList [("GL",0.0), ("V",0.0), ("H",0.0),
+ ("PV",0.0), ("L",0.0)]),
+ ("V", Dct.fromList [("B",1.0), ("H",1.0),
+ ("L",1.0), ("R",1.0)]),
+ ("R", Dct.fromList [("MN",1.5), ("PL",1.0)]),
+ ("MULT", Dct.fromList [("MULT", 1.0)]),
+ ("VC", Dct.empty)]
+regions :: Dct.Map String [Int]
+regions = Dct.fromList [("ne", [2..11]++[17..23]),
+ ("nw", [11..17]++[23..41]++[77..83]),
+ ("yk", [41..75]),
+ -- ++[75..77] (Isle of Man isn't on GOR map)
+ ("wm", [112..126]++[140..157]),
+ ("em", [83..112]++[126..140]++[157..172]),
+ ("ee", [172..191]++[217..238]),
+ ("se", [206..217]++[262..279]++[302..315]),
+ ("sw", [193..206]++[240..262]++[279..302]),
+ ("ld", [238..240])]
+test = [["", "applV1H", "applV1L", "applC1GL", "applV2", "catcV1H", "askMULT0MULT", "askV1H", "askV1B"],
+ ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+ ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+ ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+ ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+ ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+ ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"]]
diff --git a/nord/RepairTalBanken.hs b/nord/RepairTalBanken.hs
new file mode 100644
index 0000000..6d7db19
--- /dev/null
+++ b/nord/RepairTalBanken.hs
@@ -0,0 +1,56 @@
+------- yet another uncrosser ------------
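+-- The Tree type and its 'dat' accessor are not defined in this file; the
+-- definitions below are a minimal sketch of what the code here assumes
+-- (labelled internal nodes, doubly-labelled leaves).
+module RepairTalBanken where
+data Tree a = Node a [Tree a] | Leaf a a deriving Show
+dat :: Tree a -> a
+dat (Node a _) = a
+dat (Leaf a _) = a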
+example = Node ("S", 555)
+ [Node ("VP",510)
+ [Node ("NP",502)
+ [Leaf ("DET",0) ("Das",0), Leaf ("N",1) ("Buch",1), Leaf ("PRON",4) ("ihm",4)]
+ , Leaf ("V",2) ("gegeben",2)
+ , Leaf ("PART",5) ("habe",5)]
+ , Node ("NP",501) [Leaf ("PRON",3) ("ich",3)]]
+sexample = spanTree example
+spanTree :: Tree (String, Integer) -> Tree (Integer, Integer)
+spanTree (Leaf (_,i) _) = Leaf (i,i+1) (i,i+1)
+spanTree (Node _ kids) = Node (minimum starts, maximum ends) trees
+  where trees = map spanTree kids
+        starts = map (fst . dat) trees
+        ends = map (snd . dat) trees
+uncross' :: [Tree (Integer,Integer)] -> [Tree (Integer,Integer)]
+uncross' [] = []
+uncross' (Leaf a w : siblings) = Leaf a w : uncross' siblings
+uncross' (Node a kids : siblings) =
+  uncross'' . both depair . span continuous . pairs $ kids
+  where uncross'' (co,[])    = co ++ uncross' siblings
+        uncross'' (co,disco) = co ++ uncross' (insert siblings disco)
+pairs l = zip l (tail l)
+both f (x,y) = (f x, f y)
+continuous (t, t') = snd (dat t) == fst (dat t')
+depair l = (fst $ head l) : map snd l
+insert = (++) -- insert has to stick disco in siblings somewhere and then uncross
+ -- it all. Not necessarily in that order.
+{-uncross (Node a kids) = Node a (uncross' kids)
+uncross l = l
+uncross' :: [Siblings] -> [Siblings] -- but uncrossed
+-- OK the problem is that insert might need to drop disco down a couple of levels into siblings
+-- in other words, the first step is to check which sibling disco belongs IN or AFTER
+-- then you may have to insert down, i.e. repeat the insert for the chosen sibling's kids
+-- ... told you there might be a lot of consing!
+insert siblings disco = let (before,actual:after) = splitBy ((lhs disco) >) siblings in
+ if rhs disco > lhs actual then -- or something like this
+ before ++ actual : disco ++ after -- um..you get the idea
+ else
+ before ++ (insert (kids actual) disco : after) -- whoo CONS! -}
+ -- also this recursive step should do some uncrossing of before and after, right?
+
+{- The idea is that you start at the leftmost kid of a Node.
+ You take as much as is continuous and you cons that onto the rest of the siblings+disco
+ after that has all been uncrossed.
+ co : uncross (insert disco siblings)
+ except that uncross has to take additional arguments?
+ also some uncrossings may have to burrow arbitrarily deep. I'm not sure of the limit yet.
+ Anyway you'd have to do it either way, bottom-up or top-down, so at least top-down it's
+ easier to maintain pointers to it all even if there is a lot of consing involved.
+
+ actually now that I sleep on it, I'm not sure about arbitrary deepness. I think that's
+ needed only if you want to try Wolfgang's bottom-up style. Adriane Boyd's split style
+ just attaches the disco part as a sibling to the co part I think.
+-}
diff --git a/nord/TestDistance.hs b/nord/TestDistance.hs
new file mode 100644
index 0000000..d72160b
--- /dev/null
+++ b/nord/TestDistance.hs
@@ -0,0 +1,49 @@
+import Distance hiding (main)
+import Test.QuickCheck.Batch
+import Test.QuickCheck
+import Control.Arrow ((&&&))
+import Util
+import Data.List (group, sort, nub)
+import Data.List.Split (splitOn)
+import qualified Data.Map as Map
+{- instance Arbitrary Char where
+ arbitrary = choose ('\32', '\128')
+ coarbitrary c = variant (ord c `rem` 4) -}
+
+exampleLines = splitOn "\n" "Morgoth\na\nb\n***\na\nc"
+
+prop_histogram_list :: [Int] -> Bool -- oops, this is a test for Util.
+prop_histogram_list l = Map.toList (histogram l) == listhist l
+ where listhist = sort & group & map (head &&& length)
+prop_histogram_empty :: [Int] -> Bool
+prop_histogram_empty l = (l == []) == (histogram l == Map.empty)
+
+prop_r_empty :: Bool
+prop_r_empty = r [] == 0.0
+prop_r_one = r [(1,2)] == 1.0
+prop_r_two = r [(1,2), (10,20)] == 11.0
+
+prop_cmp_truncate :: [Int] -> [Int] -> Property
+prop_cmp_truncate r1 r2 = (r2 /= [] && r1 /= []) ==>
+ length (cmp r1 r2) == length (nub r2)
+prop_cmp_zeroes_r1 :: [Int] -> [Int] -> Property
+prop_cmp_zeroes_r1 r1 r2 = (r2 /= [] && r1 /= []) ==>
+ countBy (/=0.0) (map fst rcompare)
+ == Map.size (histogram r1 `Map.intersection` histogram r2)
+ where rcompare = cmp r1 r2
+prop_cmp_iterate :: [Int] -> [Int] -> Property
+prop_cmp_iterate r1 r2 = (r1 /= [] && r2 /= []) ==>
+  all (\ ((a,b),(c,d)) -> abs (a - b) < abs (c - d)) $
+  zip rcompare $
+  map (\ (f,n) -> (fromIntegral $ Map.findWithDefault 0 f (histogram r1),
+                   fromIntegral n))
+      (Map.toList $ histogram r2)
+  where rcompare = cmp r1 r2
+main = runTests
+ "The Basics"
+ TestOptions {no_of_tests = 100
+ ,length_of_tests = 1
+ , debug_tests = False}
+ [run prop_histogram_list
+ , run prop_histogram_empty
+ , run prop_r_empty
+ , run prop_r_one
+ , run prop_r_two
+ , run prop_cmp_truncate
+ , run prop_cmp_zeroes_r1
+ , run prop_cmp_iterate]
diff --git a/prg-pres.tex b/prg-pres.tex
new file mode 100644
index 0000000..715b5c7
--- /dev/null
+++ b/prg-pres.tex
@@ -0,0 +1,255 @@
+\documentclass{beamer}
+
+\usetheme{Ilmenau}
+\usepackage[all]{xy}
+
+\title{Syntax Distance for Dialectometry \\ Progress Report : Parsing
+ Swedish Badly}
+\author{Nathan Sanders}
+\date{\today}
+
+\begin{document}
+
+\begin{frame}
+ \frametitle{Welcome to Jones!}
+
+ \begin{itemize}
+ \item Try Haskell, a Very Nice Functional Language!
+ \item We have met the enemy and he is us.
+ \item Who would win? Henry VIII or George III?
+ \item Only the pure of heart can remove the sword from the stone.
+ \item With great powder comes great rapidity.
+ \item 640K should be enough for anyone.
+ \item A spoonful of sugar makes the medicine go down.
+ \item Try O'Caml, a Very Nice Functional Language!
+ \item Try Scala, a Very Nice Functional Language!
+ \item Try Nice, a Very Clean Functional Language!
+ \item Try Clean, a Very Nice Functional Language!
+ \item Try Nemerle, a Very Nice Functional Language!
+ \item Try Miranda\texttrademark, a Functional Language!
+ \item Try F\#, a Very Nice Functional Language!
+ \item Try Scheme, a Very Nice Functional Language!
+ \item Try Clojure, a Very Nice Functional Language!
+ \end{itemize}
+\end{frame}
+
+\frame{\titlepage}
+\section{Experiment}
+\begin{frame}
+ \frametitle{Experiment}
+ Find syntactic differences between dialects of Swedish
+ \begin{enumerate}
+ \item Corpus: Swediasyn, unparsed interviews transcribed and glossed
+ to standard Swedish
+ \item Training corpus: Talbanken, 300 Kwords of parsed spoken and
+ written Swedish from the late 70s.
+ \item Annotators: TnT, MaltParser and the Berkeley parser, all
+ trained on Talbanken.
+ \end{enumerate}
+\end{frame}
+\section{Prototype}
+\begin{frame}
+ \frametitle{Convert Talbanken}
+ From TIGER-XML, a flat XML representation in ISO-8859-1 (Latin1)
+ \begin{itemize}
+ \item To TnT POS (word --- tab --- POS)
+ \item To PTB (nested s-exps, which are kind of annoying to produce
+ from TIGER-XML)
+ \item Swediasyn is UTF-8, and all the Java-based parsers require it,
+ so Talbanken must be converted.
+ \end{itemize}
+\end{frame}
+\begin{frame}[fragile]
+ \frametitle{Talbanken Example}
+\begin{verbatim}
+<s id="...">
+  <graph root="...">
+    <terminals>
+      <t id="..." word="ja" pos="POPPHH"/>
+      <t id="..." word="tycker" pos="VVPS"/>
+      ...
+    </terminals>
+    <nonterminals>
+      <nt id="..." cat="S">
+        <edge idref="..." label="..."/>
+      </nt>
+      ...
+    </nonterminals>
+  </graph>
+</s>
+\end{verbatim}
+\end{frame}
+\begin{frame}[fragile]
+ \frametitle{TnT Example}
+\begin{verbatim}
+ja POPPHH
+tycker VVPS
+för PR
+(imagine f\:or above, pdflatex doesn't like UTF-8 either)
+min POXPHHGG
+del NN
+\end{verbatim}
+\end{frame}
+\begin{frame}[fragile]
+ \frametitle{PTB Example}
+\begin{verbatim}
+(ROOT (S
+ (POPPHH ja)
+ (VVPS tycker)
+ (PP (PR för) (POXPHHGG min) (NN del))
+ (S
+ (XP (UKAT att) (UKAT att))
+ (RJ >>)
+ (EHRJPU >>)
+ (POOP de)
+ (AVPS e)
+  (NP (PP (ABZA för) (ABZA mycke) (PR av))
+ (EN en)
+ (NN monolog)
+ (S (PORP som) (NNDDHH prästen) (VVPS håller))))))
+\end{verbatim}
+\end{frame}
+\begin{frame}
+ \frametitle{T'n'T}
+ \begin{itemize}
+ \item Training: No problems yet, default parameters
+ \item Classification: Low vocabulary overlap between Talbanken and
+ dialects of Swediasyn. (around 15\% unknown tokens, more for small
+ corpora)
+ \item There were more before converting Talbanken to UTF-8.
+ \end{itemize}
+\end{frame}
+\begin{frame}
+ \frametitle{MaltParser}
+ \begin{itemize}
+ \item Training: Already done and downloadable. Thanks Joakim!
+  \item Conversion: POS-tagged Swedia to CoNLL is trivial because CoNLL is a flat
+ format.
+ \item Classification: Default parameters, results don't appear to be very good.
+ \end{itemize}
+\end{frame}
+\begin{frame}[fragile]
+ \frametitle{Example}
+\begin{columns}
+\column[c]{0.7\textwidth}
+\begin{tabular}{ccccc}
+1& varit& AVSN& 0& ROOT\\
+2& v\"aldigt& AJ& 3& AA \\
+3& intresserad& AJ& 7& SS\\
+4& av& PR& 3& ET \\
+5& det& PODP& 4& PA \\
+6& h\"ar& ID& 5& HD \\
+7& \aa{}ka& VVIV& 0& ROOT\\
+8& ikring& PR& 0& ROOT\\
+9& och& ++OC& 0& ROOT\\
+10& titta& VVIV& 0& ROOT\\
+11& p\aa{}& PR& 0& ROOT\\
+12& platser& NN& 11& PA \\
+13& . & IP& 1& IP \\
+\end{tabular}
+\column[c]{0.3\textwidth}
+\textit{Been interested in going around there and looking at
+places}. (translate.google.com)
+\end{columns}
+\end{frame}
+\begin{frame}
+ \frametitle{Berkeley Parser}
+ \begin{itemize}
+ \item Training: Takes 2 GB. No more, no less. So I used
+ banks (Don't tell Josh).
+ \item Classification: Default parameters, I haven't looked at
+ results closely. Takes about 3 days, but less than 1 GB.
+ \end{itemize}
+\end{frame}
+\begin{frame}[fragile]
+ \frametitle{Example}
+\begin{columns}
+\column[c]{0.6\textwidth}
+\begin{verbatim}
+(S (VVPS AVSN)
+ (VNDD AJ)
+ (VVIV AJ)
+ (CNP (NNDD PR)
+ (NNDD PODP)
+ (PN__HH ID)
+ (NNDDHHGG VVIV))
+ (VN__SS PR)
+ (VVPS ++OC)
+ (NP (NP (NN VVIV)
+ (PN__HH PR))
+ (PN__HH NN)
+ (ID IP)))
+\end{verbatim}
+\column[c]{0.4\textwidth}
+(varit AVSN) \\
+(v\"a{}ldigt AJ) \\
+(intresserad AJ) \\
+(av PR) \\
+(det PODP) \\
+(h\"ar ID) \\
+(\aa{}ka VVIV) \\
+(ikring PR) \\
+(och ++OC) \\
+(titta VVIV) \\
+(p\aa{} PR) \\
+(platser NN)\\
+(. IP)
+\end{columns}
+\end{frame}
+\begin{frame}
+ \frametitle{Features}
+ \begin{itemize}
+ \item Trigrams: Trivial, same as before.
+ \item Leaf-ancestors: Same as before, except now in Haskell.
+ \item Dependency paths: for each leaf, record the path to the root.
+ \end{itemize}
+\end{frame}
+\begin{frame}
+ \frametitle{Leaf-Ancestor Paths}
+
+\begin{columns}
+\column[c]{0.5\textwidth}
+\[\xymatrix{
+ &&\textrm{S} \ar@{-}[dl] \ar@{-}[dr] &&\\
+ &\textrm{NP} \ar@{-}[d] \ar@{-}[dl] &&\textrm{VP} \ar@{-}[d]\\
+ \textrm{Det} \ar@{-}[d] & \textrm{N} \ar@{-}[d] && \textrm{V} \ar@{-}[d] \\
+\textrm{the}& \textrm{dog} && \textrm{barks}\\}
+\]
+\column[c]{0.5\textwidth}
+\begin{itemize}
+\item S-NP-Det-The
+\item S-NP-N-dog
+\item S-VP-V-barks
+\end{itemize}
+\end{columns}
+
+\end{frame}
+\begin{frame}
+ \frametitle{Dependency Paths}
+
+\begin{columns}
+\column[c]{0.5\textwidth}
+\[\xymatrix{
+& & root \\
+DET \ar@/^/[r] & NP\ar@/^/[r] & V \ar@{.>}[u] \\
+The & dog & barks
+}
+\]
+\column[c]{0.5\textwidth}
+\begin{itemize}
+\item root-V-N-Det-the
+\item root-V-N-dog
+\item root-V-barks
+\end{itemize}
+\end{columns}
+
+\end{frame}
+\begin{frame}[fragile]
+ \frametitle{Distance}
+ \begin{itemize}
+ \item g++ -O2
+ \item Same as before: \verb+r = map (uncurry (-) & abs) & sum+
+ \item Significance test is only 100 iterations, down from 1000.
+ \item May be ghc -O2 soon.
+ \end{itemize}
+\end{frame}
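+% Spelled out with (&) = flip (.) from Sed.hs, the distance one-liner above
+% is sum . map (abs . uncurry (-)); as a sketch (cf. the QuickCheck
+% properties prop_r_one and prop_r_two in nord/TestDistance.hs):
+%   r :: [(Double, Double)] -> Double
+%   r = sum . map (abs . uncurry (-))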
+\end{document}
diff --git a/sanders_acl07.tex b/sanders_acl07.tex
new file mode 100644
index 0000000..439a81a
--- /dev/null
+++ b/sanders_acl07.tex
@@ -0,0 +1,521 @@
+\documentclass[11pt,letterpaper]{article}
+\pdfpagewidth=\paperwidth
+\pdfpageheight=\paperheight
+\usepackage{times}
+\usepackage{latexsym}
+\usepackage{acl07}
+\usepackage[all]{xy}
+\author{\textbf{Nathan C. Sanders} \\ Department of Linguistics \\
+ Indiana University \\
+ Bloomington, IN 47405, USA \\ \texttt{ncsander@indiana.edu}}
+\title{Measuring Syntactic Difference in British English}
+\begin{document}
+\maketitle
+%TODO:
+% 7. Check results of r^2 test and write them into the poster
+% 9. *Maybe* real-language examples for methods. But if you have trouble with
+% abc/xyz how do you handle REAL CS?! Or theoretical syntax? Or PERL?!?
+\begin{abstract}Recent work by
+ \newcite{nerbonne06} has provided a foundation for measuring
+ syntactic differences between corpora. It uses part-of-speech trigrams as an
+ approximation to syntactic structure, comparing the trigrams of two
+ corpora for statistically significant differences.
+
+ This paper extends the method and its application. It extends the
+  method by using the leaf-ancestor paths of \newcite{sampson00} instead of
+ trigrams, which capture internal syntactic structure---every leaf in
+ a parse tree records the path back to the root.
+
+ The corpus used for testing is the International Corpus of English,
+ Great Britain \cite{nelson02}, which contains syntactically
+ annotated speech of Great Britain. The speakers are grouped into
+ geographical regions based on place of birth. This is different in
+ both nature and number than previous experiments, which found
+ differences between two groups of Norwegian L2 learners of
+ English. We show that dialectal variation in eleven British regions from the ICE-GB
+ is detectable by our algorithm, using both leaf-ancestor paths and trigrams.
+\end{abstract}
+
+\section{Introduction}
+In the measurement of linguistic distance, older work such as
+\newcite{seguy73} was able to measure distance in most areas of
+linguistics, such as phonology, morphology, and syntax. The
+features used for comparison were hand-picked based on
+linguistic knowledge of the area being surveyed. These features,
+while probably lacking in completeness of coverage, certainly allowed
+a rough comparison of distance in all linguistic domains.
+In contrast, computational methods have
+focused on a single area of language. For example, a method for
+determining phonetic distance is given by \newcite{heeringa04}. Heeringa
+and others have also done related work on phonological distance in
+\newcite{nerbonne97} and \newcite{gooskens04}. A measure of syntactic
+distance is the obvious next step: \newcite{nerbonne06} provide one such
+method. This method approximates internal syntactic structure using
+vectors of part-of-speech trigrams. The trigram types can then be
+compared for statistically significant differences using a permutation
+test.
+
+This study can be extended in a few ways.
+First, the trigram approximation works well, but it
+does not necessarily capture all the information of syntactic
+structure such as long-distance movement. Second,
+the experiments did not test data for geographical dialect variation,
+but compared two generations of Norwegian L2 learners of English,
+which differed in age of initial acquisition.
+
+We address these areas by using the syntactically annotated speech
+section of the International Corpus of English, Great Britain (ICE-GB)
+\cite{nelson02}, which provides a corpus with full syntactic annotations,
+one that can be divided into groups for comparison. The sentences
+of the corpus, being represented as parse trees rather than a vector
+of POS tags, are
+converted into a vector of leaf-ancestor paths, which were developed
+by \newcite{sampson00} to aid in parser evaluation by providing a way to
+compare gold-standard trees with parser output trees.
+
+In this way, each sentence produces its own vector of leaf-ancestor
+paths. Fortunately, the
+permutation test used by \newcite{nerbonne06} is already designed to
+normalize the effects of differing sentence length when combining POS
+trigrams into a single vector per region. The only change needed is
+the substitution of leaf-ancestor paths for trigrams.
+
+The speakers in the ICE-GB are divided by place of birth into
+geographical regions of
+England based on the nine Government Office Regions, plus Scotland and
+Wales. The
+average region contains a little over 4,000
+sentences and 40,000 words. This is less than the size of the
+Norwegian corpora, and leaf-ancestor paths are more
+complex than trigrams, meaning that the amount of data required for
+obtaining significance should increase. Testing on smaller corpora
+should quickly show whether corpus size can be reduced without losing
+the ability to detect differences.
+
+Experimental results show that differences can be detected among the
+larger regions: as should be expected with a method
+that measures statistical significance, larger corpora allow easier
+detection of significance. The limit seems to be around 250,000 words for
+leaf-ancestor paths, and 100,000 words for POS trigrams, but more careful
+tests are needed to verify this.
+Comparisons to judgments of dialectologists have not yet
+been made. The comparison is difficult because of the
+difference in methodology and amount of detail in
+reporting. Dialectology tends to collect data from a few informants
+at each location and to provide a more complex account of relationship
+than the like/unlike judgments provided by permutation tests.
+
+\section{Methods}
+
+The methods used to implement the syntactic difference test come from two
+sources. The primary source is the syntactic comparison of
+\newcite{nerbonne06}, which uses a permutation test, explained in
+\newcite{good95} and in particular for linguistic purposes in
+\newcite{kessler01}. Their permutation test
+collects POS trigrams from a random subcorpus of sentences
+sampled from the combined corpora. The trigram frequencies are
+normalized to neutralize the
+effects of sentence length, then compared to the
+trigram frequencies of the complete corpora.
+% \cite{nerbonne06} compare
+% two generations of Norwegian L2 learners of English.
+
+The principal difference between the work of \newcite{nerbonne06} and ours is
+the use of leaf-ancestor paths.
+Leaf-ancestor paths were developed by \newcite{sampson00} for
+estimating parser performance by providing a measure of similarity of
+two trees, in particular a gold-standard tree and a machine-parsed
+tree. This distance is not used for our method, since for our purposes,
+it is enough that leaf-ancestor paths represent syntactic information, such as
+upper-level tree structure, more explicitly than trigrams.
+
+The permutation test used by \newcite{nerbonne06} is independent of the
+type of item whose frequency is measured, treating the items as atomic
+symbols. Therefore, leaf-ancestor paths should do just as well as
+trigrams as long as they do not introduce any additional constraints
+% constraints should=> statistical anomalies!
+on how they are generated from the corpus. Fortunately, this is not
+the case; \newcite{nerbonne06} generate $N-2$ POS trigrams from each
+sentence of length $N$; we generate $N$ leaf-ancestor paths from each
+parsed sentence in the corpus. Normalization is needed to account for
+the frequency differences caused by sentence length variation; it is
+presented below. Since the same number (minus two) of trigrams and
+leaf-ancestor paths are generated for each sentence, the same
+normalization can be used for both methods.
+
+\subsection{Leaf-Ancestor Paths}
+
+Sampson's leaf-ancestor paths represent syntactic structure
+by aggregating nodes starting from each leaf and proceeding up to
+the root---for our experiment, the leaves are parts of speech.
+This maintains constant input from
+the lexical items of the sentence, while giving the parse tree some
+weight in the representation.
+
+For example, the parse tree
+\[\xymatrix{
+ &&\textrm{S} \ar@{-}[dl] \ar@{-}[dr] &&\\
+ &\textrm{NP} \ar@{-}[d] \ar@{-}[dl] &&\textrm{VP} \ar@{-}[d]\\
+ \textrm{Det} \ar@{-}[d] & \textrm{N} \ar@{-}[d] && \textrm{V} \ar@{-}[d] \\
+\textrm{the}& \textrm{dog} && \textrm{barks}\\}
+\]
+creates the following leaf-ancestor paths:
+
+\begin{itemize}
+\item S-NP-Det-The
+\item S-NP-N-dog
+\item S-VP-V-barks
+\end{itemize}
+
+There is one path for each word, and the root appears
+in all three. However, there can be ambiguities if some
+node happens
+to have identical siblings. Sampson gives the example
+of the two trees
+\[\xymatrix{
+ &&\textrm{A} \ar@{-}[dl] \ar@{-}[dr] &&&\\
+ &\textrm{B} \ar@{-}[d] \ar@{-}[dl] &&\textrm{B} \ar@{-}[d] \ar@{-}[dr] & \\
+ \textrm{p} & \textrm{q} && \textrm{r} & \textrm{s} \\
+}
+\]
+and
+\[\xymatrix{
+ &&\textrm{A} \ar@{-}[d] &&&\\
+ &&\textrm{B} \ar@{-}[dll] \ar@{-}[dl] \ar@{-}[dr] \ar@{-}[drr]&&& \\
+ \textrm{p} & \textrm{q} && \textrm{r} & \textrm{s} \\
+}
+\]
+which would both produce
+
+ \begin{itemize}
+ \item A-B-p
+ \item A-B-q
+ \item A-B-r
+ \item A-B-s
+ \end{itemize}
+
+ There is no way to tell from the paths which leaves belong to which
+ B node in the first tree, and there is no way to tell the paths of
+ the two trees apart despite their different structure. To avoid this
+ ambiguity, Sampson uses a bracketing system; brackets are inserted
+ at appropriate points to produce
+ \begin{itemize}
+ \item $[$A-B-p
+ \item A-B]-q
+ \item A-[B-r
+ \item A]-B-s
+ \end{itemize}
+and
+ \begin{itemize}
+ \item $[$A-B-p
+ \item A-B-q
+ \item A-B-r
+ \item A]-B-s
+ \end{itemize}
+
+Left and right brackets are inserted: at most one
+in every path. A left bracket is inserted in a path containing a leaf
+that is a leftmost sibling and a right bracket is inserted in a path
+containing a leaf that is a rightmost sibling. The bracket is inserted
+at the highest node for which the leaf is leftmost or rightmost.
+
+It is a good exercise to derive the bracketing of the previous two trees in detail.
+In the first tree, with two B
+siblings, the first path is A-B-p. Since $p$ is a leftmost child,
+a left bracket must be inserted, at the root in this case. The
+resulting path is [A-B-p. The next leaf, $q$, is rightmost, so a right
+bracket must be inserted. The highest node for which it is rightmost
+is B, because the rightmost leaf of A is $s$. The resulting path is
+A-B]-q. Contrast this with the path for $q$ in the second tree; here $q$
+is not rightmost, so no bracket is inserted and the resulting path is
+A-B-q. $r$ is in almost the same position as $q$, but reversed: it is the
+leftmost, and the right B is the highest node for which it is the
+leftmost, producing A-[B-r. Finally, since $s$ is the rightmost leaf of
+the entire sentence, the right bracket appears after A: A]-B-s.
+
+At this point, the alert reader will have
+noticed that both a left bracket and right bracket can be inserted for
+a leaf with no siblings since it is both leftmost and rightmost. That is,
+a path with two brackets on the same node could be produced: A-[B]-c. Because
+of this redundancy, single children are
+excluded by the bracket markup algorithm. There is still
+no ambiguity between two single leaves and a single node with two
+leaves because only the second case will receive brackets.
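+% A minimal Haskell sketch (illustration only; bracket markup omitted) of
+% reading unbracketed leaf-ancestor paths off a parse tree:
+%   data Tree = Leaf String String | Node String [Tree]
+%   paths :: Tree -> [[String]]
+%   paths (Leaf pos word) = [[pos, word]]
+%   paths (Node cat kids) = map (cat :) (concatMap paths kids)
+% For the example tree above this yields S-NP-Det-the, S-NP-N-dog and
+% S-VP-V-barks.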
+
+% See for yourself:
+% \[\xymatrix{
+% &\textrm{A} \ar@{-}[dl] \ar@{-}[dr] &\\
+% \textrm{B} \ar@{-}[d] &&\textrm{B} \ar@{-}[d] \\
+% \textrm{p} && \textrm{q} \\
+% }
+% \]
+
+% \[\xymatrix{
+% &\textrm{A} \ar@{-}[d] &\\
+% &\textrm{B} \ar@{-}[dl] \ar@{-}[dr] & \\
+% \textrm{p} && \textrm{q} \\
+% }
+% \]
+
+% \newcite{sampson00} also gives a method for comparing paths to obtain an
+% individual path-to-path distance, but this is not necessary for the
+% permutation test, which treats paths as opaque symbols.
+
+\subsection{Permutation Significance Test}
+
+With the paths of each sentence generated from the corpus, then sorted
+by type into vectors, we now try to determine
+whether the paths of one region occur in significantly different
+numbers from the paths of another region. To do this, we calculate some
+measure to characterize the difference between two vectors as a single
+number. \newcite{kessler01} creates a simple measure called the
+{\sc Recurrence} metric ($R$ hereafter), which
+is simply the sum of absolute differences of all path token counts
+$c_{ai}$ from the first corpus $A$ and $c_{bi}$ from the second corpus
+$B$.
+\[ R = \sum_i |c_{ai} - \bar{c_i}| \textrm{ where } \bar{c_i} = \frac{c_{ai} + c_{bi}}{2}\]
+However, to find out if the value of $R$ is significant, we
+must use a permutation test with a Monte Carlo technique described by
+\newcite{good95}, following
+closely the same usage by \newcite{nerbonne06}. The intuition behind
+the technique is to compare the $R$ of the two corpora with the $R$ of
+two random subsets of the combined corpora. If the random subsets' $R$s
+are greater than the $R$ of the two actual corpora in fewer than $p$
+percent of the permutations, then we can reject the null hypothesis that
+the two were actually drawn from the same corpus: that is, we can assume that
+the two corpora are different.
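+% A Haskell sketch of $R$ over the paired per-type counts (illustration only):
+%   r :: [(Double, Double)] -> Double
+%   r = sum . map (\(ca, cb) -> abs (ca - (ca + cb) / 2))
+% The permutation p-value is then the fraction of random splits whose $R$
+% meets or exceeds the observed value.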
+
+However, before the $R$ values can be compared, the path counts in the
+random subsets must
+be normalized since not all paths will occur in every subset, and
+average sentence length will differ, causing relative path frequency
+to vary. There are two normalizations that must occur: normalization
+with respect to sentence length, and
+normalization with respect to other paths within a subset.
+
+The first stage of normalization normalizes the counts for each path
+within the pair of vectors $a$ and $b$. The purpose is to neutralize the
+difference in sentence length, in which longer sentences with more
+words cause paths to be relatively less frequent.
+Each count is converted to a frequency $f$ \[f=\frac{c}{N} \] where
+$c$ is either $c_{ai}$ or $c_{bi}$ from above and $N$ is the length of the
+containing vector $a$ or $b$. This produces two frequencies, $f_{ai}$ and
+$f_{bi}$. Then the frequency is scaled
+back up to a redistributed count by the equation
+\[\forall j \in a,b : c'_{ji} = \frac{f_{ji}(c_{ai}+c_{bi})}{f_{ai}+f_{bi}}\]
+This will redistribute the total of a pair from $a$ and $b$ based on
+their relative frequencies. In other words, the total of each path
+type $c_{ai} + c_{bi}$ will remain the same, but the values of
+$c_{ai}$ and $c_{bi}$ will be balanced by their frequency
+within their respective vectors.
+
+For example, assume that the two corpora have 10 sentences each, with
+a corpus $a$ with only 40 words and another, $b$, with 100 words. This
+results in $N_a = 40$ and $N_b = 100$. Assume also that there is a
+path $i$ that occurs in both: $c_{ai} = 8$ in $a$ and $c_{bi} = 10$ in
+$b$. This means that the relative frequencies are $f_{ai} = 8/40 = 0.2$
+and $f_{bi} = 10/100 = 0.1$. The first normalization will redistribute the
+total count (18) according to relative size of the frequencies. So
+\[c_{ai}' = \frac{0.2(18)}{0.2+0.1} = 3.6 / 0.3 = 12\] and
+\[c_{bi}' = \frac{0.1(18)}{0.2+0.1} = 1.8 / 0.3 = 6\]
+Now that 8 has been scaled to 12 and 10 to 6, the effect of sentence length
+has been neutralized. This reflects the intuition that something that
+occurs 8 of 40 times is more important than something that occurs 10
+of 100 times.
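+% A Haskell sketch of this first normalisation (names illustrative only):
+%   redistribute na nb (ca, cb) =
+%     let fa = ca / na
+%         fb = cb / nb
+%     in (fa * (ca + cb) / (fa + fb), fb * (ca + cb) / (fa + fb))
+% e.g. redistribute 40 100 (8, 10) == (12.0, 6.0), as in the example above.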
+
+% this is the (*2n) / N bit
+The second normalization normalizes all values in both
+permutations with respect to each other. This is simple: find the
+average number of times each path appears, then divide each scaled
+count by it. This produces numbers whose average is 1.0 and whose
+values express how many times greater than the average each count is.
+The average path
+count is $N / 2n$, where $N$ is the number of path tokens in
+both permutations and $n$ is the number of path types. Division by
+two is necessary since $N$ counts the path tokens of both permutations,
+while each count $c'_{ji}$ comes from a single permutation. Each type entry in the
+vector now becomes \[\forall j \in a,b : s_{ji} = \frac{2nc_{ji}'}{N}\]
+
+Starting from the previous example, this second normalization first
+finds the average. Assuming 5 unique paths (types) for $a$ and 30 for
+$b$ gives \[n = 5 + 30 = 35\] and
+\[N = N_a + N_b = 40 + 100 = 140\]
+Therefore, the average path type has $140 / 2(35) = 2$
+tokens in each of $a$ and $b$. Dividing $c_{ai}'$ and $c_{bi}'$ by this average gives $s_{ai} = 6$
+and $s_{bi} = 3$. In other words, $s_{ai}$ has 6 times more tokens
+than the average path type.
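+% Continuing the sketch, the second normalisation scales each redistributed
+% count by the average count per path type, $N / 2n$:
+%   scale bigN n c' = c' * (2 * n) / bigN
+% e.g. scale 140 35 12 == 6.0 and scale 140 35 6 == 3.0, matching
+% $s_{ai} = 6$ and $s_{bi} = 3$ above.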
+
+%% This is definitely the code at the end of normaliseall
+% The third stage of normalization normalizes all subset corpora with
+% respect to each other. This process has the same effect as the second stage of
+% normalization, but carried out over all subsets:
+% each path type is averaged over every corpus; then each type is divided
+% by that average. The final result is the same as the second normalization:
+% each corpus consists of multiples of an average of $1.0$. The
+% difference is that after the second normalization all types within a
+% single subset corpus vary around an average of 1.0. After the third
+% normalization, the values for a single type vary around an average of
+% 1.0 across all subsets.
+
+% For example,
+% assume that $s_a = 6$ and $s_b = 3$ from the previous example are
+% members of a subset corpus $C_1$ and that another corpus $C_2$
+% has values $ss_a = 2$ and $ss_b = 3$ for the same path. Assume this
+% path type has an average of 4 across all subsets. This gives a
+% normalized value of $s'_a = 1.5$ and
+% $ss'_a = 0.5$. Doing the same operation with an average $S_b = 3$ gives
+% normalized values $s'_b = 1$ and $ss'_b = 1$. Remember, $s_a = 6$ means
+% that $s_a$ appears 6 times more than the average path in its
+% subset corpus. After the last normalization, $s_a = 1.5$ means that 6 is
+% 1.5 times more than the average of this path across all
+% subsets. Conversely, $ss_a$'s 2 times is only 0.5 the average for
+% this path.
+
+\section{Experiment and Results}
+
+The experiment was run on the syntactically annotated part of the
+International Corpus of English, Great Britain corpus (ICE-GB).
+The syntactic annotation labels terminals with one of twenty parts of
+speech and internal nodes with a category and a function
+marker. Therefore, the leaf-ancestor paths each started at the root of
+the sentence and ended with a part of speech.
+For comparison to the experiment conducted by \newcite{nerbonne06}, the
+experiment was also run with POS trigrams. Finally, a control
+experiment was conducted by comparing two permutations from the same
+corpus and ensuring that they were not significantly different.
+
+ICE-GB reports the place of birth of each speaker, which is the best
+available approximation to which dialect a speaker uses. As a simple,
+objective partitioning, the speakers were divided into 11 geographical
+regions based on the 9 Government Office Regions of England with Wales
+and Scotland added as single regions. Some speakers had to be thrown
+out at this point because they lacked birthplace information or were
+born outside the UK. Each region varied in size; however, the average
+number of sentences per corpus was 4,682, with an average of 44,726
+words per corpus (see table \ref{size}). Thus, the average
+sentence length was 9.55 words. The average corpus was smaller than
+the Norwegian L2 English corpora of \newcite{nerbonne06}, which had two
+groups, one with 221,000 words and the other with 84,000.
+
+% NW=NW
+% NE=Northumbria
+% Yorkshire=Yorkshire/Humber
+% East=East Anglia
+% London=London
+% Southeast=SE
+% Southwest=West
+% East-Midlands~=Middle England (except I think bigger)
+% West-Midlands~=Heart of England (except smaller--nothing to S or E)
+\begin{table}
+\begin{tabular}{|lcc|} \hline
+Region & sentences & words \\
+\hline \hline
+ East England & 855 & 10471 \\ \hline
+ East Midlands & 1944 & 16924 \\ \hline
+ London & 24836& 244341 \\ \hline
+ Northwest England & 3219 & 27070 \\ \hline
+ Northeast England & 1012 & 10199 \\ \hline
+ Scotland & 2886 & 27198 \\ \hline
+ Southeast England & 11090 & 88915 \\ \hline
+ Southwest England & 939 & 7107 \\ \hline
+ West Midlands & 960 & 12670 \\ \hline
+ Wales & 2338 & 27911 \\ \hline
+ Yorkshire & 1427 & 19092 \\ \hline
+\end{tabular}
+\caption{Subcorpus size}
+\label{size}
+\end{table}
+%Priscilla Rasmussen, ACL, 209 N. Eighth Street, Stroudsburg, PA 18360
+
+\begin{table}
+ \begin{tabular}{|c|c|} \hline % or use * ** *** notation
+ Region & Significantly different ($p < 0.05$) \\ \hline
+ London & East Midlands, NW England \\
+ & SE England, Scotland \\ \hline
+ SE England & Scotland \\ \hline
+ \end{tabular}
+\caption{Significant differences, leaf-ancestor paths}
+ \label{diffs}
+\end{table}
+
+Significant differences (at $p < 0.05$) were found when
+comparing the largest regions, but no significant differences were
+found when comparing small regions to other small regions. The
+significant differences found are given in tables \ref{diffs} and
+\ref{trigramdiffs}. It seems that summed corpus size must reach a
+certain threshold before differences can be observed reliably: about 250,000
+words for leaf-ancestor paths and 100,000 for trigrams. There are exceptions in
+both directions; the total size of London compared to Wales is larger than
+the size of London
+compared to the East Midlands, but the former is not statistically different.
+On the other hand, the total size of Southeast England compared to
+Scotland is only half of the other significantly different comparisons; this
+difference may be a result of more extreme syntactic differences
+between these two areas than between the others.
+Finally, it is interesting to note that the summed Norwegian corpus
+size is around 305,000 words, which is about three times the size needed
+for significance as estimated from the ICE-GB data.
+
+\begin{table}
+ \begin{tabular}{|c|c|} \hline % or use * ** *** notation
+ Region & Significantly different ($p < 0.05$) \\ \hline
+ London & East Midlands, NW England, \\
+ & NE England, SE England,\\
+ & Scotland, Wales \\ \hline
+ SE England & London, East Midlands, \\
+ & NW England, Scotland \\ \hline
+ Scotland & London, SE England, Yorkshire \\ \hline
+ \end{tabular}
+\caption{Significant differences, POS trigrams}
+ \label{trigramdiffs}
+\end{table}
+
+\section{Discussion}
+
+Our work extends that of \newcite{nerbonne06} in a number of ways. We
+have shown that an alternate method of representing syntax still
+allows the permutation test to find significant differences between
+corpora. In addition, we have shown differences between corpora divided
+by geographical area rather than language proficiency, with many more
+corpora than before. Finally, we have shown that the size of the
+corpus can be reduced somewhat while still obtaining significant results.
+
+Furthermore, we also have shown that both leaf-ancestor paths and POS
+trigrams give similar results, although the more complex paths require more data.
+
+However, there are a number of directions that this experiment should
+be extended. A comparison that divides the speakers into traditional
+British dialect areas is needed to see if the same differences can be
+detected. This is very likely, because corpus divisions that better
+reflect reality have a better chance of achieving a significant difference.
+
+In fact, even though leaf-ancestor paths should provide finer
+distinctions than trigrams and thus require more data for detectable
+significance, the regional corpora presented here were smaller than
+the Norwegian speakers' corpora in \newcite{nerbonne06} by up to a factor of
+10. This raises the question of a lower limit on corpus size. Our
+experiment suggests that the two corpora must have at least 250,000 words,
+although we suspect that better divisions will allow smaller corpus sizes.
+
+While we are reducing corpus size, we might as well compare the
+increasing numbers of smaller and smaller corpora in an advantageous
+order. It should be possible to cluster corpora by the point at which
+they fail to achieve a significant difference when split from a
+larger corpus. In this way, regions could be
+grouped by their detectable boundaries, not a priori distinctions
+based on geography or existing knowledge of dialect boundaries.
+
+Of course this indirect method would not be needed if one had a direct
+method for clustering speakers, by distance or other
+measure. Development of such a method is worthwhile research for the future.
+
+%THEND
+
+\bibliographystyle{acl}
+\bibliography{central}
+\end{document}
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
diff --git a/scrumph.org b/scrumph.org
new file mode 100644
index 0000000..e22f391
--- /dev/null
+++ b/scrumph.org
@@ -0,0 +1,64 @@
+* Scrumph : Internal Scrum
+** 2009/12/2
+ - I re-arranged hypotheses. I marked up the methods intro with
+ section numbers.
+ - I will rewrite the headings of the methods section. I will
+ re-arrange methods intro so the section numbers look less
+ stupid. I will mail Sandra to set up a time on Friday to talk
+ about the new methods sections.
+ - Hypotheses section still sucks pretty bad. There is a lot of noise
+ text left to be excised before giving a draft to Sandra.
+** 2009/12/3
+ - I rewrote the headings of the method sections. I re-arranged the
+ methods intro so that at least the section 3 references are in
+ order (though nested within section 5 references). I mailed
+ Sandra and set up a time around 1:30. Maybe earlier; I will
+ probably go to the reading group.
+ - I will write stubs for unfinished method sections. I will rewrite
+ the hypotheses section to remove the noise. I will clean up the
+ whole thing for noise and send a copy to Sandra.
+** 2009/12/4
+ - I wrote stubs for the unfinished method sections. I rewrote the
+ hypotheses sections to remove noise, along with the whole thing,
+ and sent a copy to Sandra.
+ - I will meet with Sandra to ask her about the appropriateness of
+ the new subsections. I will expand the ones that I keep as much
+ as I can, then do additional research to find out what to put in
+ the others.
+ - I have a ton of errands to run. Probably I should turn in my
+ Swedish book soon and try to find one that actually has
+ linguists in mind.
+** 2009/12/7
+ - I met with Sandra, she gave me some advice. I did research on
+ kinds of backoff and wrote up a couple of the sections.
+ - I will finish research for alternate distance measures section,
+ write up all sections, and maybe start making them sound good.
+** 2009/12/8
+   - I finished research on the alternate distance measures section and wrote
+ up all sections. None of them sound particularly good.
+ - I will make all sections sound good.
+   - Some of the sections still need a little research and some
+ citation (textbooks, mainly, though)
+** 2009/12/9
+ - I made all the sections sound good, except for the last sentence
+ of each one. ugh. I added citations from the appropriate papers
+ where they were missing.
+ - I will double-check the last stupid-sounding sentences and
+ re-read the whole methods section, then send to Sandra. The rest
+ of the day I will work on converting to git (installing on
+ peregrin if needed), resolving unicode problems in Consts.hs and
+ investigate the lexicalisation of the Berkeley parser.
+** 2009/12/10
+   - I added more to the proposal than I thought I'd have to, then
+ sent it off to Sandra. I switched to the more reliable way of
+ storing non-standard diacritic sequences in Haskell in Consts.hs.
+ - I will start testing the build process with tagPos, because it
+ calls Swedia.extractTnt, which I'm working on. I will verify that
+ both the Python and Haskell versions reproduce all the relevant
+ words from the interviews.
+ - Lexicalisation of the Berkeley parser (trainCfg) is delayed until
+     tagPos and tagDep are tested.
+* TODO Analyse and make up todos based on notes from last talk
+* TODO Test each step of build.py to make sure it produces the correct output. Start with Swedia.hs I guess
+* TODO Compare output of Swedia.hs and swedia.py. Haskell port doesn't look right yet.
diff --git a/test_sed.py b/test_sed.py
new file mode 100644
index 0000000..a166901
--- /dev/null
+++ b/test_sed.py
@@ -0,0 +1,285 @@
+## from unittest import TestCase, TestSuite, TestLoader, TextTestRunner
+import tester
+from util.lst import *
+from sed import *
+def checktype(f):
+    args,res = typ(f).split("->")
+    return res.strip()==typ(f(*map(mak,args.strip().split("*"))))
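+# typ and mak come from the test helpers (not in this diff): typ renders a
+# value's type as a string like "[int]" or "{str:[float]}", and mak builds a
+# dummy value of such a type (see testMak below). For a function, typ is
+# presumably its declared signature, e.g. "int * str -> bool", so checktype
+# calls f on dummy arguments and checks the type string of the result.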
+### data ###
+def testRegions(self):
+    # 1. all of the values are in range(1, 315)
+ test(set(concat(regions.values())) - set(range(1,315)), set())
+ # 2. the only skipped rows are Isle of Man and region 23
+ # (which is now in Wales), plus row 1 which holds the word titles
+ test(set(range(1,315)) ^ set(concat(regions.values())),
+ set([1, 75, 76, 191, 192]))
+ #TODO: Check that each region has the correct number of sites.
+    # London has apparently only two sites here.
+ # Also check correlation between similarity and number of sites.
+ # This needs to be normalised for
+ test(dct.map(len, regions),
+ dict(zip(['em', 'ld', 'ee', 'wm', 'sw', 'yk', 'ne', 'se', 'nw'],
+ [ 58, 2, 40, 31, 58, 34, 15, 41, 30])))
+### util ###
+ # extract, curried, takewhile, dropwhile, split_by ought all to be
+ # added to util. Since str is not [char] in Python, there need to be
+ # two versions of takewhile and dropwhile. This is necessary in Scheme
+ # for the same reason.
+ # extract also needs two versions for most languages
+def testLstExtract(self):
+ test(lst_extract([], []), [])
+ test(lst_extract(range(10), []), [])
+ # this version allows multiple identical values
+ test(lst_extract(range(10), [0,0,0]), [0,0,0])
+ test(lst_extract(range(10), [0,1,2]), [0,1,2])
+ # and out-of-order values
+ test(lst_extract(range(10), [2,0,1]), [2,0,1])
+ test(lst_extract(range(10), range(10)), range(10))
+ test(lst_extract(range(10), reversed(range(10))),
+ list(reversed(range(10))))
+ # but this ability may be superfluous in a portable, efficient version
+def testCurried(self):
+ # trivial
+ f = lambda: 12
+ test(f(), 12)
+ test(curried(f)(), 12)
+ add1 = lambda n:n+1
+ test(add1(2), 3)
+ test(curried(add1)(3), 4)
+ add = lambda n,m:n+m
+ test(add(2,3), 5)
+ test(curried(add)(2)(3), 5)
+ curried(add)(2)
+ self.assertRaises(TypeError, curried(add)(2), 3, 4)
+ self.assertRaises(TypeError, curried(add), 2, 3)
+def testTakewhile(self):
+ test(takewhile(lambda _:True)("foo"), "foo")
+ test(takewhile(lambda _:False)("foo"), "")
+ test(takewhile(lambda c:c=='c')("foo"), "")
+ test(takewhile(lambda c:c!='c')("foo"), "foo")
+ test(takewhile(lambda c:c=='f')("foo"), "f")
+ test(map(takewhile(lambda c:c!='-'), ["foo-bar", "barbaz", "---"]),
+ ['foo', 'barbaz', ''])
+def testDropwhile(self):
+ test(dropwhile(lambda _:True)("foo"), "")
+ test(dropwhile(lambda _:False)("foo"), "foo")
+ test(dropwhile(lambda c:c=='c')("foo"), "foo")
+ test(dropwhile(lambda c:c!='c')("foo"), "")
+ test(dropwhile(lambda c:c=='f')("foo"), "oo")
+ test(map(dropwhile(lambda c:c!='-'), ["foo-bar", "barbaz", "---"]),
+ ['-bar', '', '---'])
+def testMapc(self):
+ test(mapc(lambda _:True)(range(3)), [True, True, True])
+ test(mapc(lambda n:n+1)(range(3)), range(1,4))
+ test(map(mapc(lambda n:n+1), [range(3), range(4,6), range(3,9)]),
+ [range(1,4), range(5,7), range(4,10)])
+### read CSV ###
+# this is rickety ad-hoc code, so the purpose of these tests is NOT to stress
+# the code; it's essentially throw-away, conformed to the shape of the data
+# anyway. These tests are meant to make sure the output of group_regions
+# is exactly the data structure I expect. This is tested using fake data.
+fake = map(list, transpose(['hey aV1L redC1C redV1L carV1H carV1L'.split(),
+ map(str, range(1,7)),
+ map(str, range(11,17)),
+ map(str, range(111,117))]))
+fake_regions = {'e':[2,3], 'f':[4]}
+grouped_sed = group_sed_in_gor()
+# answer = analyse(grouped_sed) # this makes the test run VERY slowly
+# because the real analysis takes about a minute or so.
+def testGroupWords(self):
+ answer = group_words(fake)
+ test(len(fake), 6)
+ test(typ(fake), '[[str]]')
+ test(typ(answer), "{str:{str:{str:[float]}}}")
+ test(set(answer.keys()), set('a red car'.split()))
+ test(set(answer['red'].keys()), set(["C1", "V1"]))
+ test(answer["red"]["C1"].keys(), ["C"])
+ test(answer["red"]["C1"]["C"], [3.0, 13.0, 113.0])
+def testGroupRegions(self):
+ answer = group_regions(fake_regions, group_words(fake))
+ test(typ(fake_regions), "{str:[int]}")
+ test(typ(answer), "{str:{str:{str:{str:[float]}}}}")
+ test(set(answer.keys()), set('ef'))
+ test(answer,
+ {'e':{'a': {"V1":{"L":[2,12]}},
+ 'red': {"C1":{"C":[3,13]}, "V1":{"L":[4,14]}},
+ 'car': {"V1":{"H":[5,15], "L":[6,16]}}},
+ 'f':{'a': {"V1":{"L":[112]}},
+ 'red': {"C1":{"C":[113]}, "V1":{"L":[114]}},
+ 'car': {"V1":{"H":[115], "L":[116]}}}})
+def testGroupSedInGor(self):
+    # is it the right type?
+ test(typ(grouped_sed), "{str:{str:{str:{str:[float]}}}}")
+ # did it work
+ test('ee' in grouped_sed, True)
+ test('wmen' in grouped_sed['ee'], True)
+ test('V1' in grouped_sed['ee']['wmen'], True)
+ test(grouped_sed["ee"]["wmen"]["V1"].keys(), ["H"])
+def testFlatten(self):
+ flat = flatten(grouped_sed)
+ fake = {'a':{'a':{'a':{'a':[1.0,2.0,3.0]},
+ 'b':{'b':[0.0,12.0,1.1]},
+ 'c':{'0':[0.0]}}}}
+ test(typ(flat), "[[[{str:[float]}]]]")
+ test(flat[0][51][0], grouped_sed["ee"]["wmen"]["V1"])
+ test(flatten(fake), [[[{'a':[1.0,2.0,3.0]},
+ {'b':[0.0,12.0,1.1]},
+ {'0':[0.0]}]]])
+def testFeatureSub(self):
+ # no-op
+ test(feature_sub({},{}), 0.0)
+ test(feature_sub({'a':[1.0]},{}), 1.0)
+ test(feature_sub({},{'a':[1.0]}), 1.0)
+ test(feature_sub({'a':[]}, {'a':[1.0]}), 0.0)
+ # unshared features
+ test(feature_sub({'a':[1.0]},{'b':[1.0]}), 2.0)
+ # shared features
+ test(feature_sub({'a':[1.0]},{'a':[1.0]}), 0.0)
+ test(feature_sub({'a':[1.0]},{'a':[0.0]}), 1.0)
+ test(feature_sub({'a':[1.0]},{'a':[0.5]}), 0.5)
+ # -cross
+ test(feature_sub({'a':[1.0]},{'a':[0.5,1.0]}), 0.25)
+ test(feature_sub({'a':[1.0]},{'a':[0.5,0.0]}), 0.75)
+ test(feature_sub({'a':[1.0,0.5]},{'a':[0.5,0.0]}), 0.5)
+ test(feature_sub({'a':[1.0,2.0]},{'a':[0.5,0.0]}), 1.25)
+ # -avg
+ test(feature_sub({'a':[1.0],'b':[0.0,1.0]},
+ {'a':[0.5,0.0],'b':[0.5]}), 1.25)
+ # whole fish execution!
+ test(feature_sub({'a':[1.0],'b':[0.0,1.0],'c':[0.0]},
+ {'a':[0.5,0.0],'b':[0.5],'d':[]}), 3.25)
+ test(feature_sub({'a':[1.0],'b':[0.0,3.0],'c':[0.0]},
+ {'a':[0.5,0.0],'b':[5.0],'d':[]}), 6.25)
+ # imbalanced number of informants
+ test((feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]},
+ {'a':[0.5], 'b':[0.5] }) ==
+ feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]},
+ {'a':[0.5,0.0],'b':[0.5,1.0]}) ==
+ feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]},
+ {'a':[0.5,1.0,0.0],'b':[0.5,0.0,1.0]})),
+ True)
+ test(feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]},
+ {'a':[0.5], 'b':[0.5] }),
+ 1.0)
+ test(feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]},
+ {'a':[0.5,0.0],'b':[0.5,1.0]}),
+ 1.0)
+ test(feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]},
+ {'a':[0.5,1.0,0.0],'b':[0.5,0.0,1.0]}),
+ 1.0)
+ # = 0 unshared features + avg(sum(|0.5-1.0| + |0.5-0.1| + .. + |
+ #NOTE: I don't think this is quite right. The missing features have way too
+ # much weight, so when C/V are compared (which happens often) they will
+ # quickly outweigh the other data. What is a good weight?
+ #total_feature_count = len(fs1+fs2)
+ # then maybe ??
+ #unshared_weight = (1/total_feature_count) * len(set(fs1) ^ set(fs2))
+ # ne and ld are the smallest
+def testSedAvg(self):
+ test(checktype(sed_avg), True)
+ test(sed_avg([{"a":[0.0,1.0],"b":[0.3,3.0]}], []), 0.0)
+ test(sed_avg([], [{"a":[0.0,1.0],"b":[0.3,3.0]}]), 0.0)
+ test(sed_avg([{"a":[0.0,1.0],"b":[0.3,3.0]}], [{'a':[0.0,1.0]}]), 1.5)
+ # is the cross >>= avg really working?
+ # that is, can we duplicate existing data with no change to the average?
+ test(sed_avg([{"a":[0.0,1.0],"b":[0.3,3.0]}],
+ [{'a':[0.0,1.0]},{'a':[1.0,2.0]},
+ {'a':[0.0,1.0]},{'a':[1.0,2.0]}]),
+ sed_avg([{"a":[0.0,1.0],"b":[0.3,3.0]}],
+ [{'a':[0.0,1.0]},{'a':[1.0,2.0]}]))
+ # OK, so uhhh .. is that enough? I don't know.
+def testSedAvgTotal(self):
+ test(checktype(sed_avg_total), True)
+ test(sed_avg_total(([[]], [{"a":[0.0,1.0],"b":[0.3,3.0]}])), 0.0)
+ test(sed_avg_total(([[{"a":[0.0,1.0],"b":[0.3,3.0]}]],
+ [[{'a':[0.0,1.0]}]])),
+ 0.75)
+ # average of multiple items remains the same
+ test(sed_avg_total(([[{"a":[0.0,1.0],"b":[0.3,3.0]}],
+ [{"a":[0.0,1.0],"b":[0.3,3.0]}]],
+ [[{'a':[0.0,1.0]}],[{'a':[0.0,1.0]}]])),
+ 0.75)
+ # the number of words in each region must be the same
+ self.assertRaises(TypeError,
+ sed_avg_total,
+ ([[{"a":[0.0,1.0],"b":[0.3,3.0]}],
+ [{"a":[0.0,1.0],"b":[0.3,3.0]}]],
+ [[{'a':[0.0,1.0]}],[{'a':[0.0,1.0]}],[{'b':[3.0]}]]))
+def testSedLevenshtein(self):
+ # same number of characters but some different features
+ test(sed_levenshtein(([{'a':[0.0,1.0]}, {'a':[1.0,1.5]}],
+ [{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}]),
+ 5.0),
+ 3.5)
+ test(lev._levenshtein([{'a':[0.0,1.0]}, {'a':[1.0,1.5]}],
+ [{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}],
+ 5.0,
+ (lambda _:5.0,lambda _:5.0,feature_sub)),
+ [[0.0,5.0,10.0],
+ [5.0,1.5,6.5],
+ [10.0,6.25,3.5]])
+ # identical features but different values
+ test(sed_levenshtein(([{'a':[0.0,1.0]}, {'b':[1.0,1.5]}],
+ [{'a':[1.0,1.0]}, {'b':[1.0,1.2]}]),
+ 5.0),
+ 0.75)
+ test(lev._levenshtein([{'a':[0.0,1.0]}, {'b':[1.0,1.5]}],
+ [{'a':[1.0,1.0]}, {'b':[1.0,1.2]}],
+ 5.0,
+ (lambda _:5.0,lambda _:5.0,feature_sub)),
+ [[0.0,5.0,10.0],
+ [5.0,0.5,5.5],
+ [10.0,5.5,0.75]])
+def testSedDistance(self):
+ # same as sed_levenshtein with only one region
+ test(sed_distance(([[{'a':[0.0,1.0]}, {'a':[1.0,1.5]}],
+ [{'a':[0.0,0.0]}, {'a':[0.0,0.5]}]],
+ [[{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}]]),
+ 5.0),
+ 3.5)
+ # zip ignores unmatched extras
+ test(sed_distance(([[{'a':[0.0,1.0]}, {'a':[1.0,1.5]}],
+ [{'a':[0.0,0.0]}, {'a':[0.0,0.5]}]],
+ [[{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}]]),
+ 5.0),
+ 3.5)
+ # sum of two identical things is twice the original
+ test(sed_distance(([[{'a':[0.0,1.0]}, {'a':[1.0,1.5]}],
+ [{'a':[0.0,1.0]}, {'a':[1.0,1.5]}]],
+ [[{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}],
+ [{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}]]),
+ 5.0),
+ 7.0)
+ # sum works otherwise (3.5 + 0.75 from sed_levenshtein)
+ test(sed_distance(([[{'a':[0.0,1.0]}, {'a':[1.0,1.5]}],
+ [{'a':[0.0,1.0]}, {'b':[1.0,1.5]}]],
+ [[{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}],
+ [{'a':[1.0,1.0]}, {'b':[1.0,1.2]}]]),
+ 5.0),
+ 4.25)
+def testAnalyse(self):
+ # is it correct to use different averages for each pair?
+ # it doesn't make any difference for the actual results so I guess
+ # it doesn't matter.
+ # averages = map(sed_avg_total, lst.all_pairs(flatten(grouped_sed))),
+ # test(stdev(averages) < 1.0, True)
+ pass
+def testMak(self):
+ test(mak('bool'), False)
+ test(mak('int'), 0)
+ test(mak('float'), 0.0)
+ test(mak('str'), '')
+ test(mak('[int]'), [0])
+ test(mak('[bool]'), [False])
+ test(mak('{str:int}'), {'':0})
+ test(mak('(int,int,str)'), (0,0,''))
+ self.assertRaises(ValueError, mak, '[int}')
+ self.assertRaises(ValueError, mak, '{int}') # ironically, of course, this
+ # is valid Python 3.0 syntax. (this is the real reason Guido wanted to
+ # get rid of lambda)
+ self.assertRaises(ValueError, mak, '[]')
+ self.assertRaises(ValueError, mak, '{}')
+ self.assertRaises(ValueError, mak, '{:}')
+ self.assertRaises(ValueError, mak, '()')
+
+tester.runTest(__import__(__name__), locals())