diff --git a/ACL07_poster_equations.tex b/ACL07_poster_equations.tex new file mode 100644 index 0000000..d97c713 --- /dev/null +++ b/ACL07_poster_equations.tex @@ -0,0 +1,25 @@ +\documentclass[11pt]{article} +\usepackage[vcentering]{geometry} % dvips not needed for pdflatex +% \usepackage[vcentering,dvips]{geometry} +% % eqn 1 +% \geometry{papersize={20mm,15mm},total={30mm,24mm}} +% % eqn 1 +% \geometry{papersize={50mm,18mm},total={65mm,26mm}} +% % eqn 1 +% \geometry{papersize={40mm,18mm},total={52mm,26mm}} +% eqn 1 +\geometry{papersize={68mm,18mm},total={85mm,26mm}} +\begin{document} +\thispagestyle{empty} +% \[f = \frac{c}{N}\] + +% \[\forall j \in a,b : c'_{ji} = \frac{f_{ji}(c_{ai}+c_{bi})}{f_{ai}+f_{bi}}\] + +% \[\forall j \in a,b : s_{ji} = \frac{2nc_{ji}'}{N}\] + +\[ R = \Sigma_i |c_{ai} - \bar{c_i}| \textrm{ where } \bar{c_i} = \frac{c_{ai} + c_{bi}}{2}\] +\end{document} +%%% Local Variables: +%%% mode: latex +%%% TeX-master: t +%%% End: diff --git a/Lev.hs b/Lev.hs new file mode 100644 index 0000000..c8c1044 --- /dev/null +++ b/Lev.hs @@ -0,0 +1,22 @@ +module Lev where +import qualified Data.Map as Dct +import Control.Monad (liftM) +enum = zip [0..] +takeFail 0 _ = Just [] +takeFail _ [] = Nothing +takeFail n (x:xs) = liftM (x:) (takeFail (n - 1) xs) +window n l = case takeFail n l of + Nothing -> [] + Just xs -> xs : window n (tail l) +-- _levenshtein :: (Enum b, Num b, Ord b) => + -- [t] -> [a] -> b -> (a -> b, t -> b, a -> t -> b) -> [b] +_levenshtein ss ts indel (ins,del,sub) = + let initial l = [0,indel..indel * fromIntegral (length l)] + in foldl (\ table (i,s) -> + (foldl (\ row (t,[prev,prev']) -> + minimum [ins t + head row, del s + prev', sub s t + prev] + : row) + [i] +-- doesn't work without double reverse. i suspect only (window 2) needs reverse + (zip ts . window 2 {- . reverse-} $ table))) + (initial ts) {-(reverse (initial ts))-} (zip (tail (initial ss)) ss) diff --git a/Main.hs b/Main.hs new file mode 100644 index 0000000..ff4c9c5 --- /dev/null +++ b/Main.hs @@ -0,0 +1,4 @@ +module Main where +import qualified Sed +main :: IO () +main = print . Sed.analyse =<< Sed.groupSedInGor diff --git a/Sed.hs b/Sed.hs new file mode 100644 index 0000000..f491f7d --- /dev/null +++ b/Sed.hs @@ -0,0 +1,90 @@ +module Sed where +import Text.Regex.Posix +import Text.CSV +import Char +import qualified Data.Map as Dct +import Data.List +import Data.Ord +import qualified Lev +--- util --- +(&) = flip (.) +dctCollapse xs k v = Dct.fromAscListWith + (++) + (sortBy (comparing fst) [(k x, [v x]) | x <- xs]) +listExtract ns xs = extract ns xs 0 + where extract [] _ _ = [] + extract (n:ns) xs i = + xs !! (n - i) : extract ns (drop (n - i) xs) n +kross (xs,ys) = do x <- xs; y <- ys; return (x,y) +both f (x,y) = (f x, f y) +pairs [] = [] +pairs (x:xs) = [(x,y) | y <- xs] ++ pairs xs +average l = sum l / fromIntegral (length l) +--- read CSV --- +segment = head & dropWhile isLower & segmentName + where segmentName s = seg++n where (seg,n,_) = + s =~ "[0-9]" :: (String,String,String) +features (title:ns) = (feature title, map read ns) + where feature s = feat where (_,_,feat) = + s =~ "[0-9]" :: (String,String,String) +groupWords csv = Dct.map (fillsegments . phones) words + where words = dctCollapse (tail csv) (head & takeWhile isLower) id + fillsegments = Dct.mapWithKey makesegment + phones l = Dct.map Dct.fromList (dctCollapse l segment features) +makesegment typ d = + let size = length . head . Dct.elems $ d + in d `Dct.union` Dct.map (replicate size) (featdict Dct.! 
init typ) +groupRegions regions words = Dct.map outermost regions + where outermost range = Dct.map inner words + where inner = Dct.map (Dct.map (listExtract (map ((-) 2) range))) +groupSedInGor = do + csv <- parseCSVFromFile "sed.csv" + case csv of + Left err -> error ("oh no:" ++ show err) + Right rows -> return $ groupRegions regions $ groupWords $ transpose rows +--- analysis --- +flatten = map (map Dct.elems . Dct.elems) . Dct.elems +analyse sed = Dct.fromList . zip edges . map (sedDistance avgregions) $ regions + where edges = pairs (Dct.keys sed) + regions = pairs (flatten sed) + avgregions = average (map sedAvgTotal regions) +featureSub seg1 seg2 = fromIntegral (Dct.size(seg1 `symmetric_difference` seg2)) + + sum (map abs (Dct.elems (Dct.intersectionWith (-) seg1 seg2))) + where symmetric_difference d e = Dct.union (e `Dct.difference` d) + (d `Dct.difference` e) +sedDistance avg = sum . map (sedLevenshtein avg) . uncurry zip +transposeWord word = transpose (map transposeSegment word) + where transposeSegment seg = map (Dct.fromList . zip (Dct.keys seg)) + (transpose (Dct.elems seg)) +sedLevenshtein a = average . map (levenshtein a) . kross . both transposeWord +levenshtein a (w1,w2) = + head $ Lev._levenshtein w1 w2 a (const a, const a, featureSub) +sedAvg :: (Ord k, Fractional a) => ([Dct.Map k [a]], [Dct.Map k [a]]) -> a +sedAvg = both (concat . transposeWord) & kross & map (uncurry featureSub) & average +sedAvgTotal (region1,region2) = average (map sedAvg (zip region1 region2)) / 2.0 +--- data --- +featdict = Dct.fromList [("C", Dct.fromList [("GL",0.0), ("V",0.0), ("H",0.0), + ("PV",0.0), ("L",0.0)]), + ("V", Dct.fromList [("B",1.0), ("H",1.0), + ("L",1.0), ("R",1.0)]), + ("R", Dct.fromList [("MN",1.5), ("PL",1.0)]), + ("MULT", Dct.fromList [("MULT", 1.0)]), + ("VC", Dct.empty)] +regions :: Dct.Map String [Int] +regions = Dct.fromList [("ne", [2..11]++[17..23]), + ("nw", [11..17]++[23..41]++[77..83]), + ("yk", [41..75]), + -- ++[75..77] (Isle of Man isn't on GOR map) + ("wm", [112..126]++[140..157]), + ("em", [83..112]++[126..140]++[157..172]), + ("ee", [172..191]++[217..238]), + ("se", [206..217]++[262..279]++[302..315]), + ("sw", [193..206]++[240..262]++[279..302]), + ("ld", [238..240])] +test = [["", "applV1H", "applV1L", "applC1GL", "applV2", "catcV1H", "askMULT0MULT", "askV1H", "askV1B"], + ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], + ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], + ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], + ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], + ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], + ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"]] diff --git a/nord/RepairTalBanken.hs b/nord/RepairTalBanken.hs new file mode 100644 index 0000000..6d7db19 --- /dev/null +++ b/nord/RepairTalBanken.hs @@ -0,0 +1,56 @@ +------- yet another uncrosser ------------ +example = Node ("S", 555) + [Node ("VP",510) + [Node ("NP",502) + [Leaf ("DET",0) ("Das",0), Leaf ("N",1) ("Buch",1), Leaf ("PRON",4) ("ihm",4)] + , Leaf ("V",2) ("gegeben",2) + , Leaf ("PART",5) ("habe",5)] + , Node ("NP",501) [Leaf ("PRON",3) ("ich",3)]] +sexample = spanTree example +spanTree :: Tree (String, Integer) -> Tree (Integer, Integer) +spanTree (Leaf (_,i) _) = Leaf (i,i+1) (i,i+1) +spanTree (Node _ kids) = Node (minimum starts, maximum ends) trees + where trees = map spanTree kids + starts = map (fst . dat) trees + ends = map (snd . 
dat) trees +uncross' :: [Tree (Integer,Integer)] -> [Tree (Integer,Integer)] +uncross' [] = [] +uncross' (Leaf a w : siblings) = Leaf a w : uncross' siblings +uncross' (Node a kids : siblings) = uncross''.both depair.span continuous.pairs + $ kids + where uncross'' (co,[]) = co ++ uncross' siblings + uncross'' (co,disco) = co ++ uncross' (insert siblings disco) +pairs l = zip l (tail l) +both f (x,y) = (f x, f y) +continuous (t, t') = snd (dat t) == fst (dat t') +depair l = (fst $ head l) : map snd l +insert = (++) -- insert has to stick disco in siblings somewhere and then uncross + -- it all. Not necessarily in that order. +{-uncross (Node a kids) = Node a (uncross' kids) +uncross l = l +uncross' :: [Siblings] -> [Siblings] -- but uncrossed +-- OK the problem is that insert might need to drop disco down a couple of levels into siblings +-- in other words, the first step is the check what siblings disco belongs IN or AFTER +-- then you may have to insert down, ie repeat the insert for the chosen sibling's kids +-- ... told you there might be a lot of consing! +insert siblings disco = let (before,actual:after) = splitBy ((lhs disco) >) siblings in + if rhs disco > lhs actual then -- or something like this + before ++ actual : disco ++ after -- um..you get the idea + else + before ++ (insert (kids actual) disco : after) -- whoo CONS! -} + -- also this recursive step should do some uncrossing of before and after, right? + +{- The idea is that you start at the leftmost kid of a Node. + You take as much as is continuous and you cons that onto the rest of the siblings+disco + after that has all been uncrossed. + co : uncross (insert disco siblings) + except that uncross has to take additional arguments? + also some uncrossings may have to burrow arbitrarily deep. I'm not sure of the limit yet. + Anyway you'd have to do it either way, bottom-up or top-down, so at least top-down it's + easier to maintain pointers to it all even if there is a lot of consing involved. + + actually now that I sleep on it, I'm not sure about arbitrary deepness. I think that's + needed only if you want to try Wolfgang's bottom-up style. Adriane Boyd's split style + just attaches the disco part as a sibling to the co part I think. +-} +T diff --git a/nord/TestDistance.hs b/nord/TestDistance.hs new file mode 100644 index 0000000..d72160b --- /dev/null +++ b/nord/TestDistance.hs @@ -0,0 +1,49 @@ +import Distance hiding (main) +import Test.QuickCheck.Batch +import Test.QuickCheck +import Control.Arrow ((&&&)) +import Util +import Data.List (group, sort, nub) +import Data.List.Split (splitOn) +import qualified Data.Map as Map +{- instance Arbitrary Char where + arbitrary = choose ('\32', '\128') + coarbitrary c = variant (ord c `rem` 4) -} + +exampleLines = splitOn "\n" "Morgoth\na\nb\n***\na\nc" + +prop_histogram_list :: [Int] -> Bool -- oops, this is a test for Util. 
+prop_histogram_list l = Map.toList (histogram l) == listhist l + where listhist = sort & group & map (head &&& length) +prop_histogram_empty :: [Int] -> Bool +prop_histogram_empty l = (l == []) == (histogram l == Map.empty) + +prop_r_empty :: Bool +prop_r_empty = r [] == 0.0 +prop_r_one = r [(1,2)] == 1.0 +prop_r_two = r [(1,2), (10,20)] == 11.0 + +prop_cmp_truncate :: [Int] -> [Int] -> Property +prop_cmp_truncate r1 r2 = (r2 /= [] && r1 /= []) ==> + length (cmp r1 r2) == length (nub r2) +prop_cmp_zeroes_r1 :: [Int] -> [Int] -> Property +prop_cmp_zeroes_r1 r1 r2 = (r2 /= [] && r1 /= []) ==> + countBy (/=0.0) (map fst rcompare) + == Map.size (histogram r1 `Map.intersection` histogram r2) + where rcompare = cmp r1 r2 +prop_cmp_iterate :: [Int] -> [Int] -> Property +prop_cmp_iterate r1 r2 = (r1 /= [] && r2 /= []) ==> all (\ ((a,b),(c,d)) -> abs (a - b) < abs (c - d)) $ zip (cmp r1 r2) $ map (\ (f,n) -> (fromIntegral $ Map.findWithDefault 0 f (histogram r1), fromIntegral n)) (Map.toList $ histogram r2) + where rcompare = cmp r1 r2 +main = runTests + "The Basics" + TestOptions {no_of_tests = 100 + ,length_of_tests = 1 + , debug_tests = False} + [run prop_histogram_list + , run prop_histogram_empty + , run prop_r_empty + , run prop_r_one + , run prop_r_two + , run prop_cmp_truncate + , run prop_cmp_zeroes_r1 + , run prop_cmp_iterate] diff --git a/prg-pres.tex b/prg-pres.tex new file mode 100644 index 0000000..715b5c7 --- /dev/null +++ b/prg-pres.tex @@ -0,0 +1,255 @@ +\documentclass{beamer} + +\usetheme{Ilmenau} +\usepackage[all]{xy} + +\title{Syntax Distance for Dialectometry \\ Progress Report : Parsing + Swedish Badly} +\author{Nathan Sanders} +\date{\today} + +\begin{document} + +\begin{frame} + \frametitle{Welcome to Jones!} + + \begin{itemize} + \item Try Haskell, a Very Nice Functional Language! + \item We have met the enemy and he is us. + \item Who would win? Henry VIII or George III? + \item Only the pure of heart can remove the sword from the stone. + \item With great powder comes great rapidity. + \item 640K should be enough for anyone. + \item A spoonful of sugar makes the medicine go down. + \item Try O'Caml, a Very Nice Functional Language! + \item Try Scala, a Very Nice Functional Language! + \item Try Nice, a Very Clean Functional Language! + \item Try Clean, a Very Nice Functional Language! + \item Try Nemerle, a Very Nice Functional Language! + \item Try Miranda\texttrademark, a Functional Language! + \item Try F\#, a Very Nice Functional Language! + \item Try Scheme, a Very Nice Functional Language! + \item Try Clojure, a Very Nice Functional Language! + \end{itemize} +\end{frame} + +\frame{\titlepage} +\section{Experiment} +\begin{frame} + \frametitle{Experiment} + Find syntactic differences between dialects of Swedish + \begin{enumerate} + \item Corpus: Swediasyn, unparsed interviews transcribed and glossed + to standard Swedish + \item Training corpus: Talbanken, 300 Kwords of parsed spoken and + written Swedish from the late 70s. + \item Annotators: TnT, MaltParser and the Berkeley parser, all + trained on Talbanken. + \end{enumerate} +\end{frame} +\section{Prototype} +\begin{frame} + \frametitle{Convert Talbanken} + From TIGER-XML, a flat XML representation in ISO-8859-1 (Latin1) + \begin{itemize} + \item To TnT POS (word --- tab --- POS) + \item To PTB (nested s-exps, which are kind of annoying to produce + from TIGER-XML) + \item Swediasyn is UTF-8, and all the Java-based parsers require it, + so Talbanken must be converted. 
+ \end{itemize} +\end{frame} +\begin{frame}[fragile] + \frametitle{Talbanken Example} +\begin{verbatim} + + + + + + ... + + + + + + + ... + +\end{verbatim} +\end{frame} +\begin{frame}[fragile] + \frametitle{TnT Example} +\begin{verbatim} +ja POPPHH +tycker VVPS +för PR +(imagine f\:or above, pdflatex doesn't like UTF-8 either) +min POXPHHGG +del NN +\end{verbatim} +\end{frame} +\begin{frame}[fragile] + \frametitle{PTB Example} +\begin{verbatim} +(ROOT (S + (POPPHH ja) + (VVPS tycker) + (PP (PR för) (POXPHHGG min) (NN del)) + (S + (XP (UKAT att) (UKAT att)) + (RJ >>) + (EHRJPU >>) + (POOP de) + (AVPS e) + (NP (P P (ABZA för) (ABZA mycke) (PR av)) + (EN en) + (NN monolog) + (S (PORP som) (NNDDHH prästen) (VVPS håller)))))) +\end{verbatim} +\end{frame} +\begin{frame} + \frametitle{T'n'T} + \begin{itemize} + \item Training: No problems yet, default parameters + \item Classification: Low vocabulary overlap between Talbanken and + dialects of Swediasyn. (around 15\% unknown tokens, more for small + corpora) + \item There were more before converting Talbanken to UTF-8. + \end{itemize} +\end{frame} +\begin{frame} + \frametitle{MaltParser} + \begin{itemize} + \item Training: Already done and downloadable. Thanks Joakim! + \item Conversion: POS-tagged Swedia to ConLL is trivial because ConLL is a flat + format. + \item Classification: Default parameters, results don't appear to be very good. + \end{itemize} +\end{frame} +\begin{frame}[fragile] + \frametitle{Example} +\begin{columns} +\column[c]{0.7\textwidth} +\begin{tabular}{ccccc} +1& varit& AVSN& 0& ROOT\\ +2& v\"aldigt& AJ& 3& AA \\ +3& intresserad& AJ& 7& SS\\ +4& av& PR& 3& ET \\ +5& det& PODP& 4& PA \\ +6& h\"ar& ID& 5& HD \\ +7& \aa{}ka& VVIV& 0& ROOT\\ +8& ikring& PR& 0& ROOT\\ +9& och& ++OC& 0& ROOT\\ +10& titta& VVIV& 0& ROOT\\ +11& p\aa{}& PR& 0& ROOT\\ +12& platser& NN& 11& PA \\ +13& . & IP& 1& IP \\ +\end{tabular} +\column[c]{0.3\textwidth} +\textit{Been interested in going around there and looking at +places}. (translate.google.com) +\end{columns} +\end{frame} +\begin{frame} + \frametitle{Berkeley Parser} + \begin{itemize} + \item Training: Takes 2 GB. No more, no less. So I used + banks (Don't tell Josh). + \item Classification: Default parameters, I haven't looked at + results closely. Takes about 3 days, but less than 1 GB. + \end{itemize} +\end{frame} +\begin{frame}[fragile] + \frametitle{Example} +\begin{columns} +\column[c]{0.6\textwidth} +\begin{verbatim} +(S (VVPS AVSN) + (VNDD AJ) + (VVIV AJ) + (CNP (NNDD PR) + (NNDD PODP) + (PN__HH ID) + (NNDDHHGG VVIV)) + (VN__SS PR) + (VVPS ++OC) + (NP (NP (NN VVIV) + (PN__HH PR)) + (PN__HH NN) + (ID IP))) +\end{verbatim} +\column[c]{0.4\textwidth} +(varit AVN) \\ +(v\"a{}ldigt AJ) \\ +(intresserad AJ) \\ +(av PR) \\ +(det PODP) \\ +(h\"ar ID) \\ +(\aa{}ka VVIV) \\ +(ikring PR) \\ +(och ++OC) \\ +(titta VVIV) \\ +(p\aa{} PR) \\ +(platser NN)\\ +(. IP) +\end{columns} +\end{frame} +\begin{frame} + \frametitle{Features} + \begin{itemize} + \item Trigrams: Trivial, same as before. + \item Leaf-ancestors: Same as before, except now in Haskell. + \item Dependency paths: for each leaf, record the path to the root. 
+ \end{itemize} +\end{frame} +\begin{frame} + \frametitle{Leaf-Ancestor Paths} + +\begin{columns} +\column[c]{0.5\textwidth} +\[\xymatrix{ + &&\textrm{S} \ar@{-}[dl] \ar@{-}[dr] &&\\ + &\textrm{NP} \ar@{-}[d] \ar@{-}[dl] &&\textrm{VP} \ar@{-}[d]\\ + \textrm{Det} \ar@{-}[d] & \textrm{N} \ar@{-}[d] && \textrm{V} \ar@{-}[d] \\ +\textrm{the}& \textrm{dog} && \textrm{barks}\\} +\] +\column[c]{0.5\textwidth} +\begin{itemize} +\item S-NP-Det-The +\item S-NP-N-dog +\item S-VP-V-barks +\end{itemize} +\end{columns} + +\end{frame} +\begin{frame} + \frametitle{Dependency Paths} + +\begin{columns} +\column[c]{0.5\textwidth} +\[\xymatrix{ +& & root \\ +DET \ar@/^/[r] & NP\ar@/^/[r] & V \ar@{.>}[u] \\ +The & dog & barks +} +\] +\column[c]{0.5\textwidth} +\begin{itemize} +\item root-V-N-Det-the +\item root-V-N-dog +\item root-V-barks +\end{itemize} +\end{columns} + +\end{frame} +\begin{frame}[fragile] + \frametitle{Distance} + \begin{itemize} + \item g++ -O2 + \item Same as before: \verb+r = map (uncurry (-) & abs) & sum+ + \item Significance test is only 100 iterations, down from 1000. + \item May be ghc -O2 soon. + \end{itemize} +\end{frame} +\end{document} diff --git a/sanders_acl07.tex b/sanders_acl07.tex new file mode 100644 index 0000000..439a81a --- /dev/null +++ b/sanders_acl07.tex @@ -0,0 +1,521 @@ +\documentclass[11pt,letterpaper]{article} +\pdfpagewidth=\paperwidth +\pdfpageheight=\paperheight +\usepackage{times} +\usepackage{latexsym} +\usepackage{acl07} +\usepackage[all]{xy} +\author{\textbf{Nathan C. Sanders} \\ Department of Linguistics \\ + Indiana University \\ + Bloomington, IN 47405, USA \\ \texttt{ncsander@indiana.edu}} +\title{Measuring Syntactic Difference in British English} +\begin{document} +\maketitle +%TODO: +% 7. Check results of r^2 test and write them into the poster +% 9. *Maybe* real-language examples for methods. But if you have trouble with +% abc/xyz how do you handle REAL CS?! Or theoretical syntax? Or PERL?!? +\begin{abstract}Recent work by + \newcite{nerbonne06} has provided a foundation for measuring + syntactic differences between corpora. It uses part-of-speech trigrams as an + approximation to syntactic structure, comparing the trigrams of two + corpora for statistically significant differences. + + This paper extends the method and its application. It extends the + method by using leaf-path ancestors of \newcite{sampson00} instead of + trigrams, which capture internal syntactic structure---every leaf in + a parse tree records the path back to the root. + + The corpus used for testing is the International Corpus of English, + Great Britain \cite{nelson02}, which contains syntactically + annotated speech of Great Britain. The speakers are grouped into + geographical regions based on place of birth. This is different in + both nature and number than previous experiments, which found + differences between two groups of Norwegian L2 learners of + English. We show that dialectal variation in eleven British regions from the ICE-GB + is detectable by our algorithm, using both leaf-ancestor paths and trigrams. +\end{abstract} + +\section{Introduction} +In the measurement of linguistic distance, older work such as +\newcite{seguy73} was able to measure distance in most areas of +linguistics, such as phonology, morphology, and syntax. The +features used for comparison were hand-picked based on +linguistic knowledge of the area being surveyed. 
These features, +while probably lacking in completeness of coverage, certainly allowed +a rough comparison of distance in all linguistic domains. +In contrast, computational methods have +focused on a single area of language. For example, a method for +determining phonetic distance is given by \newcite{heeringa04}. Heeringa +and others have also done related work on phonological distance in +\newcite{nerbonne97} and \newcite{gooskens04}. A measure of syntactic +distance is the obvious next step: \newcite{nerbonne06} provide one such +method. This method approximates internal syntactic structure using +vectors of part-of-speech trigrams. The trigram types can then be +compared for statistically significant differences using a permutation +test. + +This study can be extended in a few ways. +First, the trigram approximation works well, but it +does not necessarily capture all the information of syntactic +structure such as long-distance movement. Second, +the experiments did not test data for geographical dialect variation, +but compared two generations of Norwegian L2 learners of English, with +differences between ages of initial acquisition. + +We address these areas by using the syntactically annotated speech +section of the International Corpus of English, Great Britain (ICE-GB) +\cite{nelson02}, which provides a corpus with full syntactic annotations, +one that can be divided into groups for comparison. The sentences +of the corpus, being represented as parse trees rather than a vector +of POS tags, are +converted into a vector of leaf-ancestor paths, which were developed +by \newcite{sampson00} to aid in parser evaluation by providing a way to +compare gold-standard trees with parser output trees. + +In this way, each sentence produces its own vector of leaf-ancestor +paths. Fortunately, the +permutation test used by \newcite{nerbonne06} is already designed to +normalize the effects of differing sentence length when combining POS +trigrams into a single vector per region. The only change needed is +the substitution of leaf-ancestor paths for trigrams. + +The speakers in the ICE-GB are divided by place of birth into +geographical regions of +England based on the nine Government Office Regions, plus Scotland and +Wales. The +average region contains a little over 4,000 +sentences and 40,000 words. This is less than the size of the +Norwegian corpora, and leaf-ancestor paths are more +complex than trigrams, meaning that the amount of data required for +obtaining significance should increase. Testing on smaller corpora +should quickly show whether corpus size can be reduced without losing +the ability to detect differences. + +Experimental results show that differences can be detected among the +larger regions: as should be expected with a method +that measures statistical significance, larger corpora allow easier +detection of significance. The limit seems to be around 250,000 words for +leaf-ancestor paths, and 100,000 words for POS trigrams, but more careful +tests are needed to verify this. +Comparisons to judgments of dialectologists have not yet +been made. The comparison is difficult because of the +difference in methodology and amount of detail in +reporting. Dialectology tends to collect data from a few informants +at each location and to provide a more complex account of relationship +than the like/unlike judgments provided by permutation tests. + +\section{Methods} + +The methods used to implement the syntactic difference test come from two +sources. 
The primary source is the syntactic comparison of +\newcite{nerbonne06}, which uses a permutation test, explained in +\newcite{good95} and in particular for linguistic purposes in +\newcite{kessler01}. Their permutation test +collects POS trigrams from a random subcorpus of sentences +sampled from the combined corpora. The trigram frequencies are +normalized to neutralize the +effects of sentence length, then compared to the +trigram frequencies of the complete corpora. +% \cite{nerbonne06} compare +% two generations of Norwegian L2 learners of English. + +The principal difference between the work of \newcite{nerbonne06} and ours is +the use of leaf-ancestor paths. +Leaf-ancestor paths were developed by \newcite{sampson00} for +estimating parser performance by providing a measure of similarity of +two trees, in particular a gold-standard tree and a machine-parsed +tree. This distance is not used for our method, since for our purposes, +it is enough that leaf-ancestor paths represent syntactic information, such as +upper-level tree structure, more explicitly than trigrams. + +The permutation test used by \newcite{nerbonne06} is independent of the +type of item whose frequency is measured, treating the items as atomic +symbols. Therefore, leaf-ancestor paths should do just as well as +trigrams as long as they do not introduce any additional constraints +% constraints should=> statistical anomalies! +on how they are generated from the corpus. Fortunately, this is not +the case; \newcite{nerbonne06} generate $N-2$ POS trigrams from each +sentence of length $N$; we generate $N$ leaf-ancestor paths from each +parsed sentence in the corpus. Normalization is needed to account for +the frequency differences caused by sentence length variation; it is +presented below. Since the same number (minus two) of trigrams and +leaf-ancestor paths are generated for each sentence the same +normalization can be used for both methods. + +\subsection{Leaf-Ancestor Paths} + +Sampson's leaf-ancestor paths represent syntactic structure +by aggregating nodes starting from each leaf and proceeding up to +the root---for our experiment, the leaves are parts of speech. +This maintains constant input from +the lexical items of the sentence, while giving the parse tree some +weight in the representation. + +For example, the parse tree +\[\xymatrix{ + &&\textrm{S} \ar@{-}[dl] \ar@{-}[dr] &&\\ + &\textrm{NP} \ar@{-}[d] \ar@{-}[dl] &&\textrm{VP} \ar@{-}[d]\\ + \textrm{Det} \ar@{-}[d] & \textrm{N} \ar@{-}[d] && \textrm{V} \ar@{-}[d] \\ +\textrm{the}& \textrm{dog} && \textrm{barks}\\} +\] +creates the following leaf-ancestor paths: + +\begin{itemize} +\item S-NP-Det-The +\item S-NP-N-dog +\item S-VP-V-barks +\end{itemize} + +There is one path for each word, and the root appears +in all four. However, there can be ambiguities if some +node happens +to have identical siblings. 
Sampson gives the example +of the two trees +\[\xymatrix{ + &&\textrm{A} \ar@{-}[dl] \ar@{-}[dr] &&&\\ + &\textrm{B} \ar@{-}[d] \ar@{-}[dl] &&\textrm{B} \ar@{-}[d] \ar@{-}[dr] & \\ + \textrm{p} & \textrm{q} && \textrm{r} & \textrm{s} \\ +} +\] +and +\[\xymatrix{ + &&\textrm{A} \ar@{-}[d] &&&\\ + &&\textrm{B} \ar@{-}[dll] \ar@{-}[dl] \ar@{-}[dr] \ar@{-}[drr]&&& \\ + \textrm{p} & \textrm{q} && \textrm{r} & \textrm{s} \\ +} +\] +which would both produce + + \begin{itemize} + \item A-B-p + \item A-B-q + \item A-B-r + \item A-B-s + \end{itemize} + + There is no way to tell from the paths which leaves belong to which + B node in the first tree, and there is no way to tell the paths of + the two trees apart despite their different structure. To avoid this + ambiguity, Sampson uses a bracketing system; brackets are inserted + at appropriate points to produce + \begin{itemize} + \item $[$A-B-p + \item A-B]-q + \item A-[B-r + \item A]-B-s + \end{itemize} +and + \begin{itemize} + \item $[$A-B-p + \item A-B-q + \item A-B-r + \item A]-B-s + \end{itemize} + +Left and right brackets are inserted: at most one +in every path. A left bracket is inserted in a path containing a leaf +that is a leftmost sibling and a right bracket is inserted in a path +containing a leaf that is a rightmost sibling. The bracket is inserted +at the highest node for which the leaf is leftmost or rightmost. + +It is a good exercise to derive the bracketing of the previous two trees in detail. +In the first tree, with two B +siblings, the first path is A-B-p. Since $p$ is a leftmost child, +a left bracket must be inserted, at the root in this case. The +resulting path is [A-B-p. The next leaf, $q$, is rightmost, so a right +bracket must be inserted. The highest node for which it is rightmost +is B, because the rightmost leaf of A is $s$. The resulting path is +A-B]-q. Contrast this with the path for $q$ in the second tree; here $q$ +is not rightmost, so no bracket is inserted and the resulting path is +A-B-q. $r$ is in almost the same position as $q$, but reversed: it is the +leftmost, and the right B is the highest node for which it is the +leftmost, producing A-[B-r. Finally, since $s$ is the rightmost leaf of +the entire sentence, the right bracket appears after A: A]-B-s. + +At this point, the alert reader will have +noticed that both a left bracket and right bracket can be inserted for +a leaf with no siblings since it is both leftmost and rightmost. That is, +a path with two brackets on the same node could be produced: A-[B]-c. Because +of this redundancy, single children are +excluded by the bracket markup algorithm. There is still +no ambiguity between two single leaves and a single node with two +leaves because only the second case will receive brackets. + +% See for yourself: +% \[\xymatrix{ +% &\textrm{A} \ar@{-}[dl] \ar@{-}[dr] &\\ +% \textrm{B} \ar@{-}[d] &&\textrm{B} \ar@{-}[d] \\ +% \textrm{p} && \textrm{q} \\ +% } +% \] + +% \[\xymatrix{ +% &\textrm{A} \ar@{-}[d] &\\ +% &\textrm{B} \ar@{-}[dl] \ar@{-}[dr] & \\ +% \textrm{p} && \textrm{q} \\ +% } +% \] + +% \newcite{sampson00} also gives a method for comparing paths to obtain an +% individual path-to-path distance, but this is not necessary for the +% permutation test, which treats paths as opaque symbols. 
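+
+The bracketing procedure is straightforward to implement. The following
+Haskell sketch is purely illustrative (it assumes a bare rose-tree type and
+invented helper names, not the actual corpus data structures), but it
+reproduces the bracketed paths of the examples above:
+\begin{verbatim}
+import Data.List (intercalate)
+
+data Tree = Node String [Tree] | Leaf String
+
+-- For every leaf, record the labels on the way down, with each step's
+-- position among its siblings: (label, child index, number of siblings).
+chains :: Tree -> [([(String, Int, Int)], String)]
+chains (Leaf w)        = [([], w)]
+chains (Node lab kids) =
+  [ ((lab, i, length kids) : chain, w)
+  | (i, kid) <- zip [0 ..] kids, (chain, w) <- chains kid ]
+
+-- A leftmost (rightmost) sibling gets '[' (']') on the highest ancestor
+-- of which it is the leftmost (rightmost) leaf; only children get none.
+path :: ([(String, Int, Int)], String) -> [String]
+path (chain, leaf)
+  | onlyChild = labels ++ [leaf]
+  | otherwise = mark (++ "]") right (mark ('[' :) left labels) ++ [leaf]
+  where
+    labels    = [l | (l, _, _) <- chain]
+    onlyChild = null chain || (let (_, _, n) = last chain in n == 1)
+    left      = climb (\(_, i, _) -> i == 0)
+    right     = climb (\(_, i, n) -> i == n - 1)
+    climb p   = length (takeWhile p (reverse chain))
+    mark f k ls                 -- attach a bracket k levels above the leaf
+      | k == 0    = ls
+      | otherwise = pre ++ f x : post
+      where (pre, x : post) = splitAt (length ls - k) ls
+
+leafAncestorPaths :: Tree -> [String]
+leafAncestorPaths = map (intercalate "-" . path) . chains
+\end{verbatim}
+Applied to the first tree above, \texttt{leafAncestorPaths} yields the four
+paths \texttt{[A-B-p}, \texttt{A-B]-q}, \texttt{A-[B-r}, and \texttt{A]-B-s},
+as desired.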
+
+\subsection{Permutation Significance Test}
+
+With the paths of each sentence generated from the corpus, then sorted
+by type into vectors, we now try to determine
+whether the paths of one region occur in significantly different
+numbers from the paths of another region. To do this, we calculate some
+measure to characterize the difference between two vectors as a single
+number. \newcite{kessler01} creates a simple measure called the
+{\sc Recurrence} metric ($R$ hereafter), which
+is simply the sum of absolute differences of all path token counts
+$c_{ai}$ from the first corpus $A$ and $c_{bi}$ from the second corpus
+$B$.
+\[ R = \Sigma_i |c_{ai} - \bar{c_i}| \textrm{ where } \bar{c_i} = \frac{c_{ai} + c_{bi}}{2}\]
+However, to find out if the value of $R$ is significant, we
+must use a permutation test with a Monte Carlo technique described by
+\newcite{good95}, following
+closely the same usage by \newcite{nerbonne06}. The intuition behind
+the technique is to compare the $R$ of the two corpora with the $R$ of
+two random subsets of the combined corpora. If the random subsets' $R$s
+are greater than the $R$ of the two actual corpora less than $p$ percent
+of the time, then we can reject the null hypothesis that the two
+were actually drawn from the same corpus: that is, we can assume that
+the two corpora are different.
+
+However, before the $R$ values can be compared, the path counts in the
+random subsets must
+be normalized since not all paths will occur in every subset, and
+average sentence length will differ, causing relative path frequency
+to vary. There are two normalizations that must occur: normalization
+with respect to sentence length, and
+normalization with respect to other paths within a subset.
+
+The first stage of normalization normalizes the counts for each path
+within the pair of vectors $a$ and $b$. The purpose is to neutralize the
+difference in sentence length, in which longer sentences with more
+words cause paths to be relatively less frequent.
+Each count is converted to a frequency $f$ \[f=\frac{c}{N} \] where
+$c$ is either $c_{ai}$ or $c_{bi}$ from above and $N$ is the length of the
+containing vector $a$ or $b$. This produces two frequencies, $f_{ai}$ and
+$f_{bi}$. Then the frequency is scaled
+back up to a redistributed count by the equation
+\[\forall j \in a,b : c'_{ji} = \frac{f_{ji}(c_{ai}+c_{bi})}{f_{ai}+f_{bi}}\]
+This will redistribute the total of a pair from $a$ and $b$ based on
+their relative frequencies. In other words, the total of each path
+type $c_{ai} + c_{bi}$ will remain the same, but the values of
+$c_{ai}$ and $c_{bi}$ will be balanced by their frequency
+within their respective vectors.
+
+For example, assume that the two corpora have 10 sentences each, with
+a corpus $a$ with only 40 words and another, $b$, with 100 words. This
+results in $N_a = 40$ and $N_b = 100$. Assume also that there is a
+path $i$ that occurs in both: $c_{ai} = 8$ in $a$ and $c_{bi} = 10$ in
+$b$. This means that the relative frequencies are $f_{ai} = 8/40 = 0.2$
+and $f_{bi} = 10/100 = 0.1$. The first normalization will redistribute the
+total count (18) according to relative size of the frequencies. So
+\[c_{ai}' = \frac{0.2(18)}{0.2+0.1} = 3.6 / 0.3 = 12\] and
+\[c_{bi}' = \frac{0.1(18)}{0.2+0.1} = 1.8 / 0.3 = 6\]
+Now that 8 has been scaled to 12 and 10 to 6, the effect of sentence length
+has been neutralized. This reflects the intuition that something that
+occurs 8 of 40 times is more important than something that occurs 10
+of 100 times. 
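+
+In code, the statistic and the first normalization are both small; the
+sketch below is illustrative only (plain Haskell over paired counts, with
+invented names, not the experiment scripts):
+\begin{verbatim}
+-- The R statistic above: each path's deviation from the mean of its
+-- two counts, summed over path types.
+recurrence :: [(Double, Double)] -> Double
+recurrence counts = sum [abs (ca - (ca + cb) / 2) | (ca, cb) <- counts]
+
+-- First normalization: redistribute one path type's total count between
+-- a and b according to its relative frequency in each vector.
+-- na, nb are the vector lengths; ca, cb the raw counts.
+redistribute :: Double -> Double -> Double -> Double -> (Double, Double)
+redistribute na nb ca cb = (fa * total / fsum, fb * total / fsum)
+  where fa    = ca / na
+        fb    = cb / nb
+        total = ca + cb
+        fsum  = fa + fb
+\end{verbatim}
+On the running example, \texttt{redistribute 40 100 8 10} recovers the 12
+and 6 of the hand calculation above (up to floating-point rounding).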
+ +% this is the (*2n) / N bit +The second normalization normalizes all values in both +permutations with respect to each other. This is simple: find the +average number of times each path appears, then divide each scaled +count by it. This produces numbers whose average is 1.0 and whose +values are multiples of the amount that they are greater than the average. +The average path +count is $N / 2n$, where $N$ is the number of path tokens in +both the permutations and $n$ is the number of path types. Division by +two is necessary since we are multiplying counts from a single permutation by +token counts from both permutations. Each type entry in the +vector now becomes \[\forall j \in a,b : s_{ji} = \frac{2nc_{ji}'}{N}\] + +Starting from the previous example, this second normalization first +finds the average. Assuming 5 unique paths (types) for $a$ and 30 for +$b$ gives \[n = 5 + 30 = 35\] and +\[N = N_a + N_b = 40 + 100 = 140\] +Therefore, the average path type has $140 / 2(35) = 2$ +tokens in $a$ and $b$ respectively. Dividing $c_{ai}'$ and $c_{bi}'$ by this average gives $s_{ai} = 6$ +and $s_{bi} = 3$. In other words, $s_{ai}$ has 6 times more tokens +than the average path type. + +%% This is definitely the code at the end of normaliseall +% The third stage of normalization normalizes all subset corpora with +% respect to each other. This process has the same effect as the second stage of +% normalization, but carried out over all subsets: +% each path type is averaged over every corpus; then each type is divided +% by that average. The final result is the same as the second normalization: +% each corpus consists of multiples of an average of $1.0$. The +% difference is that after the second normalization all types within a +% single subset corpus vary around an average of 1.0. After the third +% normalization, the values for a single type vary around an average of +% 1.0 across all subsets. + +% For example, +% assume that $s_a = 6$ and $s_b = 3$ from the previous example are +% members of a subset corpus $C_1$ and that another corpus $C_2$ +% has values $ss_a = 2$ and $ss_b = 3$ for the same path. Assume this +% path type has an average of 4 across all subsets. This gives a +% normalized value of $s'_a = 1.5$ and +% $ss'_a = 0.5$. Doing the same operation with an average $S_b = 3$ gives +% normalized values $s'_b = 1$ and $ss'_b = 1$. Remember, $s_a = 6$ means +% that $s_a$ appears 6 times more than the average path in its +% subset corpus. After the last normalization, $s_a = 1.5$ means that 6 is +% 1.5 times more than the average of this path across all +% subsets. Conversely, $ss_a$'s 2 times is only 0.5 the average for +% this path. + +\section{Experiment and Results} + +The experiment was run on the syntactically annotated part of the +International Corpus of English, Great Britain corpus (ICE-GB). +The syntactic annotation labels terminals with one of twenty parts of +speech and internal nodes with a category and a function +marker. Therefore, the leaf-ancestor paths each started at the root of +the sentence and ended with a part of speech. +For comparison to the experiment conducted by \newcite{nerbonne06}, the +experiment was also run with POS trigrams. Finally, a control +experiment was conducted by comparing two permutations from the same +corpus and ensuring that they were not significantly different. + +ICE-GB reports the place of birth of each speaker, which is the best +available approximation to which dialect a speaker uses. 
As a simple, +objective partitioning, the speakers were divided into 11 geographical +regions based on the 9 Government Office Regions of England with Wales +and Scotland added as single regions. Some speakers had to be thrown +out at this point because they lacked birthplace information or were +born outside the UK. Each region varied in size; however, the average +number of sentences per corpus was 4682, with an average of 44,726 +words per corpus (see table \ref{size}). Thus, the average +sentence length was 9.55 words. The average corpus was smaller than +the Norwegian L2 English corpora of \newcite{nerbonne06}, which had two +groups, one with 221,000 words and the other with 84,000. + +% NW=NW +% NE=Northumbria +% Yorkshire=Yorkshire/Humber +% East=East Anglia +% London=London +% Southeast=SE +% Southwest=West +% East-Midlands~=Middle England (except I think bigger) +% West-Midlands~=Heart of England (except smaller--nothing to S or E) +\begin{table} +\begin{tabular}{|lcc|} \hline +Region & sentences & words \\ +\hline \hline + East England & 855 & 10471 \\ \hline + East Midlands & 1944 & 16924 \\ \hline + London & 24836& 244341 \\ \hline + Northwest England & 3219 & 27070 \\ \hline + Northeast England & 1012 & 10199 \\ \hline + Scotland & 2886 & 27198 \\ \hline + Southeast England & 11090 & 88915 \\ \hline + Southwest England & 939 & 7107 \\ \hline + West Midlands & 960 & 12670 \\ \hline + Wales & 2338 & 27911 \\ \hline + Yorkshire & 1427 & 19092 \\ \hline +\end{tabular} +\caption{Subcorpus size} +\label{size} +\end{table} +%Priscilla Rasmussen, ACL, 209 N. Eighth Street, Stroudsburg, PA 18360 + +\begin{table} + \begin{tabular}{|c|c|} \hline % or use * ** *** notation + Region & Significantly different ($p < 0.05$) \\ \hline + London & East Midlands, NW England \\ + & SE England, Scotland \\ \hline + SE England & Scotland \\ \hline + \end{tabular} +\caption{Significant differences, leaf-ancestor paths} + \label{diffs} +\end{table} + +Significant differences (at $p < 0.05$) were found when +comparing the largest regions, but no significant differences were +found when comparing small regions to other small regions. The +significant differences found are given in table \ref{diffs} and +\ref{trigramdiffs}. It seems that summed corpus size must reach a +certain threshold before differences can be observed reliably: about 250,000 +words for leaf-ancestor paths and 100,000 for trigrams. There are exceptions in +both directions; the total size of London compared to Wales is larger than +the size of London +compared to the East Midlands, but the former is not statistically different. +On the other hand, the total size of Southeast England compared to +Scotland is only half of the other significantly different comparisons; this +difference may be a result of +more extreme syntactic differences than the other areas. +Finally, it is interesting to note that the summed Norwegian corpus +size is around 305,000 words, which is about three times the size needed +for significance as estimated from the ICE-GB data. 
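+
+For reference, the decision behind ``significantly different'' can be
+sketched compactly. The Haskell fragment below illustrates only the Monte
+Carlo estimate of $p$; the actual procedure also applies the two
+normalizations described in the Methods section inside each iteration, and
+the shuffle is assumed to be supplied by the caller:
+\begin{verbatim}
+import Control.Monad (replicateM)
+
+-- Estimate the p value of an observed distance (e.g. R over path
+-- vectors) by comparing it with the distance between random re-splits
+-- of the pooled corpora.
+permutationP :: ([a] -> [a] -> Double)  -- distance statistic
+             -> ([a] -> IO [a])         -- random shuffle, assumed given
+             -> Int                     -- number of iterations
+             -> [a] -> [a]              -- the two corpora
+             -> IO Double
+permutationP stat shuffle iters xs ys = do
+    let observed = stat xs ys
+    hits <- replicateM iters (trial observed)
+    return (fromIntegral (length (filter id hits)) / fromIntegral iters)
+  where
+    trial obs = do
+      pool <- shuffle (xs ++ ys)
+      let (xs', ys') = splitAt (length xs) pool
+      return (stat xs' ys' >= obs)
+\end{verbatim}
+A pair of regions is reported as different when the estimated $p$ falls
+below 0.05.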
+ +\begin{table} + \begin{tabular}{|c|c|} \hline % or use * ** *** notation + Region & Significantly different ($p < 0.05$) \\ \hline + London & East Midlands, NW England, \\ + & NE England, SE England,\\ + & Scotland, Wales \\ \hline + SE England & London, East Midlands, \\ + & NW England, Scotland \\ \hline + Scotland & London, SE England, Yorkshire \\ \hline + \end{tabular} + \label{trigramdiffs} +\caption{Significant differences, POS trigrams} +\end{table} + +\section{Discussion} + +Our work extends that of \newcite{nerbonne06} in a number of ways. We +have shown that an alternate method of representing syntax still +allows the permutation test to find significant differences between +corpora. In addition, we have shown differences between corpora divided +by geographical area rather than language proficiency, with many more +corpora than before. Finally, we have shown that the size of the +corpus can be reduced somewhat and still obtain significant results. + +Furthermore, we also have shown that both leaf-ancestor paths and POS +trigrams give similar results, although the more complex paths require more data. + +However, there are a number of directions that this experiment should +be extended. A comparison that divides the speakers into traditional +British dialect areas is needed to see if the same differences can be +detected. This is very likely, because corpus divisions that better +reflect reality have a better chance of achieving a significant difference. + +In fact, even though leaf-ancestor paths should provide finer +distinctions than trigrams and thus require more data for detectable +significance, the regional corpora presented here were smaller than +the Norwegian speakers' corpora in \newcite{nerbonne06} by up to a factor of +10. This raises the question of a lower limit on corpus size. Our +experiment suggests that the two corpora must have at least 250,000 words, +although we suspect that better divisions will allow smaller corpus sizes. + +While we are reducing corpus size, we might as well compare the +increasing numbers of smaller and smaller corpora in an advantageous +order. It should be possible to cluster corpora by the point at which +they fail to achieve a significant difference when split from a +larger corpus. In this way, regions could be +grouped by their detectable boundaries, not a priori distinctions +based on geography or existing knowledge of dialect boundaries. + +Of course this indirect method would not be needed if one had a direct +method for clustering speakers, by distance or other +measure. Development of such a method is worthwhile research for the future. + +%THEND + +\bibliographystyle{acl} +\bibliography{central} +\end{document} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: t +%%% End: diff --git a/scrumph.org b/scrumph.org new file mode 100644 index 0000000..e22f391 --- /dev/null +++ b/scrumph.org @@ -0,0 +1,64 @@ +* Scrumph : Internal Scrum +** 2009/12/2 + - I re-arranged hypotheses. I marked up the methods intro with + section numbers. + - I will rewrite the headings of the methods section. I will + re-arrange methods intro so the section numbers look less + stupid. I will mail Sandra to set up a time on Friday to talk + about the new methods sections. + - Hypotheses section still sucks pretty bad. There is a lot of noise + text left to be excised before giving a draft to Sandra. +** 2009/12/3 + - I rewrote the headings of the method sections. 
I re-arranged the + methods intro so that at least the section 3 references are in + order (though nested within section 5 references). I mailed + Sandra and set up a time around 1:30. Maybe earlier; I will + probably go to the reading group. + - I will write stubs for unfinished method sections. I will rewrite + the hypotheses section to remove the noise. I will clean up the + whole thing for noise and send a copy to Sandra. +** 2009/12/4 + - I wrote stubs for the unfinished method sections. I rewrote the + hypotheses sections to remove noise, along with the whole thing, + and sent a copy to Sandra. + - I will meet with Sandra to ask her about the appropriateness of + the new subsections. I will expand the ones that I keep as much + as I can, then do additional research to find out what to put in + the others. + - I have a ton of errands to run. Probably I should turn in my + Swedish book soon and try to find one that actually has + linguists in mind. +** 2009/12/7 + - I met with Sandra, she gave me some advice. I did research on + kinds of backoff and wrote up a couple of the sections. + - I will finish research for alternate distance measures section, + write up all sections, and maybe start making them sound good. + - +** 2009/12/8 + - I finished research alternate distance measures section and wrote + up all sections. None of them sound particularly good. + - I will make all sections sound good. + - Some of the section sstill need a little research and some + citation (textbooks, mainly, though) +** 2009/12/9 + - I made all the sections sound good, except for the last sentence + of each one. ugh. I added citations from the appropriate papers + where they were missing. + - I will double-check the last stupid-sounding sentences and + re-read the whole methods section, then send to Sandra. The rest + of the day I will work on converting to git (installing on + peregrin if needed), resolving unicode problems in Consts.hs and + investigate the lexicalisation of the Berkeley parser. +** 2009/12/10 + - I added more than I thought I'd have to to the proposal, then + sent it off to Sandra. I switched to the more reliable way of + storing non-standard diacritic sequences in Haskell in Consts.hs. + - I will start testing the build process with tagPos, because it + calls Swedia.extractTnt, which I'm working on. I will verify that + both the Python and Haskell versions reproduce all the relevant + words from the interviews. + - Lexicalisation of the Berkeley parser (trainCfg) is delayed until + testing tagPos and tagDep are tested. +* TODO Analyse and make up todos based on notes from last talk +* TODO Test each step of build.py to make sure it produces the correct output. Start with Swedia.hs I guess +* TODO Compare output of Swedia.hs and swedia.py. Haskell port doesn't look right yet. diff --git a/test_sed.py b/test_sed.py new file mode 100644 index 0000000..a166901 --- /dev/null +++ b/test_sed.py @@ -0,0 +1,285 @@ +## from unittest import TestCase, TestSuite, TestLoader, TextTestRunner +import tester +from util.lst import * +from sed import * +def checktype(f): + args,res = typ(f).split("->") + return res.strip()==typ(f(*map(mak,args.strip().split("*")))) +### data ### +def testRegions(self): + # 1. all of the values are in the range 315 + test(set(concat(regions.values())) - set(range(1,315)), set()) + # 2. 
the only skipped rows are Isle of Man and region 23 + # (which is now in Wales), plus row 1 which holds the word titles + test(set(range(1,315)) ^ set(concat(regions.values())), + set([1, 75, 76, 191, 192])) + #TODO: Check that each region has the correct number of sites. + # London has apparently only two regions here. + # Also check correlation between similarity and number of sites. + # This needs to be normalised for + test(dct.map(len, regions), + dict(zip(['em', 'ld', 'ee', 'wm', 'sw', 'yk', 'ne', 'se', 'nw'], + [ 58, 2, 40, 31, 58, 34, 15, 41, 30]))) +### read CSV ### + # extract, curried, takewhile, dropwhile, split_by ought all to be + # added to util. Since str is not [char] in Python, there need to be + # two versions of takewhile and dropwhile. This is necessary in Scheme + # for the same reason. + # extract also needs two versions for most languages +def testLstExtract(self): + test(lst_extract([], []), []) + test(lst_extract(range(10), []), []) + # this version allows multiple identical values + test(lst_extract(range(10), [0,0,0]), [0,0,0]) + test(lst_extract(range(10), [0,1,2]), [0,1,2]) + # and out-of-order values + test(lst_extract(range(10), [2,0,1]), [2,0,1]) + test(lst_extract(range(10), range(10)), range(10)) + test(lst_extract(range(10), reversed(range(10))), + list(reversed(range(10)))) + # but this ability may be superfluous in a portable, efficient version +def testCurried(self): + # trivial + f = lambda: 12 + test(f(), 12) + test(curried(f)(), 12) + add1 = lambda n:n+1 + test(add1(2), 3) + test(curried(add1)(3), 4) + add = lambda n,m:n+m + test(add(2,3), 5) + test(curried(add)(2)(3), 5) + curried(add)(2) + self.assertRaises(TypeError, curried(add)(2), 3, 4) + self.assertRaises(TypeError, curried(add), 2, 3) +def testTakewhile(self): + test(takewhile(lambda _:True)("foo"), "foo") + test(takewhile(lambda _:False)("foo"), "") + test(takewhile(lambda c:c=='c')("foo"), "") + test(takewhile(lambda c:c!='c')("foo"), "foo") + test(takewhile(lambda c:c=='f')("foo"), "f") + test(map(takewhile(lambda c:c!='-'), ["foo-bar", "barbaz", "---"]), + ['foo', 'barbaz', '']) +def testDropwhile(self): + test(dropwhile(lambda _:True)("foo"), "") + test(dropwhile(lambda _:False)("foo"), "foo") + test(dropwhile(lambda c:c=='c')("foo"), "foo") + test(dropwhile(lambda c:c!='c')("foo"), "") + test(dropwhile(lambda c:c=='f')("foo"), "oo") + test(map(dropwhile(lambda c:c!='-'), ["foo-bar", "barbaz", "---"]), + ['-bar', '', '---']) +def testMapc(self): + test(mapc(lambda _:True)(range(3)), [True, True, True]) + test(mapc(lambda n:n+1)(range(3)), range(1,4)) + test(map(mapc(lambda n:n+1), [range(3), range(4,6), range(3,9)]), + [range(1,4), range(5,7), range(4,10)]) +### read CSV ### +# this is rickety ad-hoc code, so the purpose of these tests is NOT to stress +# the code; it's essentially throw-away, conformed to the shape of the data +# anyway. These tests are meant to make sure the output of group_regions +# is exactly the data structure I expect. This is tested using fake data. +fake = map(list, transpose(['hey aV1L redC1C redV1L carV1H carV1L'.split(), + map(str, range(1,7)), + map(str, range(11,17)), + map(str, range(111,117))])) +fake_regions = {'e':[2,3], 'f':[4]} +grouped_sed = group_sed_in_gor() +# answer = analyse(grouped_sed) # this makes the test run VERY slowly +# because the real analysis takes about a minute or so. 
+def testGroupWords(self): + answer = group_words(fake) + test(len(fake), 6) + test(typ(fake), '[[str]]') + test(typ(answer), "{str:{str:{str:[float]}}}") + test(set(answer.keys()), set('a red car'.split())) + test(set(answer['red'].keys()), set(["C1", "V1"])) + test(answer["red"]["C1"].keys(), ["C"]) + test(answer["red"]["C1"]["C"], [3.0, 13.0, 113.0]) +def testGroupRegions(self): + answer = group_regions(fake_regions, group_words(fake)) + test(typ(fake_regions), "{str:[int]}") + test(typ(answer), "{str:{str:{str:{str:[float]}}}}") + test(set(answer.keys()), set('ef')) + test(answer, + {'e':{'a': {"V1":{"L":[2,12]}}, + 'red': {"C1":{"C":[3,13]}, "V1":{"L":[4,14]}}, + 'car': {"V1":{"H":[5,15], "L":[6,16]}}}, + 'f':{'a': {"V1":{"L":[112]}}, + 'red': {"C1":{"C":[113]}, "V1":{"L":[114]}}, + 'car': {"V1":{"H":[115], "L":[116]}}}}) +def testGroupSedInGor(self): + # is't the right type? + test(typ(grouped_sed), "{str:{str:{str:{str:[float]}}}}") + # did it work + test('ee' in grouped_sed, True) + test('wmen' in grouped_sed['ee'], True) + test('V1' in grouped_sed['ee']['wmen'], True) + test(grouped_sed["ee"]["wmen"]["V1"].keys(), ["H"]) +def testFlatten(self): + flat = flatten(grouped_sed) + fake = {'a':{'a':{'a':{'a':[1.0,2.0,3.0]}, + 'b':{'b':[0.0,12.0,1.1]}, + 'c':{'0':[0.0]}}}} + test(typ(flat), "[[[{str:[float]}]]]") + test(flat[0][51][0], grouped_sed["ee"]["wmen"]["V1"]) + test(flatten(fake), [[[{'a':[1.0,2.0,3.0]}, + {'b':[0.0,12.0,1.1]}, + {'0':[0.0]}]]]) +def testFeatureSub(self): + # no-op + test(feature_sub({},{}), 0.0) + test(feature_sub({'a':[1.0]},{}), 1.0) + test(feature_sub({},{'a':[1.0]}), 1.0) + test(feature_sub({'a':[]}, {'a':[1.0]}), 0.0) + # unshared features + test(feature_sub({'a':[1.0]},{'b':[1.0]}), 2.0) + # shared features + test(feature_sub({'a':[1.0]},{'a':[1.0]}), 0.0) + test(feature_sub({'a':[1.0]},{'a':[0.0]}), 1.0) + test(feature_sub({'a':[1.0]},{'a':[0.5]}), 0.5) + # -cross + test(feature_sub({'a':[1.0]},{'a':[0.5,1.0]}), 0.25) + test(feature_sub({'a':[1.0]},{'a':[0.5,0.0]}), 0.75) + test(feature_sub({'a':[1.0,0.5]},{'a':[0.5,0.0]}), 0.5) + test(feature_sub({'a':[1.0,2.0]},{'a':[0.5,0.0]}), 1.25) + # -avg + test(feature_sub({'a':[1.0],'b':[0.0,1.0]}, + {'a':[0.5,0.0],'b':[0.5]}), 1.25) + # whole fish execution! + test(feature_sub({'a':[1.0],'b':[0.0,1.0],'c':[0.0]}, + {'a':[0.5,0.0],'b':[0.5],'d':[]}), 3.25) + test(feature_sub({'a':[1.0],'b':[0.0,3.0],'c':[0.0]}, + {'a':[0.5,0.0],'b':[5.0],'d':[]}), 6.25) + # imbalanced number of informants + test((feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]}, + {'a':[0.5], 'b':[0.5] }) == + feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]}, + {'a':[0.5,0.0],'b':[0.5,1.0]}) == + feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]}, + {'a':[0.5,1.0,0.0],'b':[0.5,0.0,1.0]})), + True) + test(feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]}, + {'a':[0.5], 'b':[0.5] }), + 1.0) + test(feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]}, + {'a':[0.5,0.0],'b':[0.5,1.0]}), + 1.0) + test(feature_sub({'a':[1.0,0.0],'b':[0.0,1.0]}, + {'a':[0.5,1.0,0.0],'b':[0.5,0.0,1.0]}), + 1.0) + # = 0 unshared features + avg(sum(|0.5-1.0| + |0.5-0.1| + .. + | + #NOTE: I don't think this is quite right. The missing features have way too + # much weight, so when C/V are compared (which happens often) they will + # quickly outweigh the other data. What is a good weight? + #total_feature_count = len(fs1+fs2) + # then maybe ?? 
+ #unshared_weight = (1/total_feature_count) * len(set(fs1) ^ set(fs2)) + # ne and ld are the smallest +def testSedAvg(self): + test(checktype(sed_avg), True) + test(sed_avg([{"a":[0.0,1.0],"b":[0.3,3.0]}], []), 0.0) + test(sed_avg([], [{"a":[0.0,1.0],"b":[0.3,3.0]}]), 0.0) + test(sed_avg([{"a":[0.0,1.0],"b":[0.3,3.0]}], [{'a':[0.0,1.0]}]), 1.5) + # is the cross >>= avg really working? + # that is, can we duplicate existing data with no change to the average? + test(sed_avg([{"a":[0.0,1.0],"b":[0.3,3.0]}], + [{'a':[0.0,1.0]},{'a':[1.0,2.0]}, + {'a':[0.0,1.0]},{'a':[1.0,2.0]}]), + sed_avg([{"a":[0.0,1.0],"b":[0.3,3.0]}], + [{'a':[0.0,1.0]},{'a':[1.0,2.0]}])) + # OK, so uhhh .. is that enough? I don't know. +def testSedAvgTotal(self): + test(checktype(sed_avg_total), True) + test(sed_avg_total(([[]], [{"a":[0.0,1.0],"b":[0.3,3.0]}])), 0.0) + test(sed_avg_total(([[{"a":[0.0,1.0],"b":[0.3,3.0]}]], + [[{'a':[0.0,1.0]}]])), + 0.75) + # average of multiple items remains the same + test(sed_avg_total(([[{"a":[0.0,1.0],"b":[0.3,3.0]}], + [{"a":[0.0,1.0],"b":[0.3,3.0]}]], + [[{'a':[0.0,1.0]}],[{'a':[0.0,1.0]}]])), + 0.75) + # the number of words in each region must be the same + self.assertRaises(TypeError, + sed_avg_total, + ([[{"a":[0.0,1.0],"b":[0.3,3.0]}], + [{"a":[0.0,1.0],"b":[0.3,3.0]}]], + [[{'a':[0.0,1.0]}],[{'a':[0.0,1.0]}],[{'b':[3.0]}]])) +def testSedLevenshtein(self): + # same number of characters but some different features + test(sed_levenshtein(([{'a':[0.0,1.0]}, {'a':[1.0,1.5]}], + [{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}]), + 5.0), + 3.5) + test(lev._levenshtein([{'a':[0.0,1.0]}, {'a':[1.0,1.5]}], + [{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}], + 5.0, + (lambda _:5.0,lambda _:5.0,feature_sub)), + [[0.0,5.0,10.0], + [5.0,1.5,6.5], + [10.0,6.25,3.5]]) + # identical features but different values + test(sed_levenshtein(([{'a':[0.0,1.0]}, {'b':[1.0,1.5]}], + [{'a':[1.0,1.0]}, {'b':[1.0,1.2]}]), + 5.0), + 0.75) + test(lev._levenshtein([{'a':[0.0,1.0]}, {'b':[1.0,1.5]}], + [{'a':[1.0,1.0]}, {'b':[1.0,1.2]}], + 5.0, + (lambda _:5.0,lambda _:5.0,feature_sub)), + [[0.0,5.0,10.0], + [5.0,0.5,5.5], + [10.0,5.5,0.75]]) +def testSedDistance(self): + # same as sed_levenshtein with only one region + test(sed_distance(([[{'a':[0.0,1.0]}, {'a':[1.0,1.5]}], + [{'a':[0.0,0.0]}, {'a':[0.0,0.5]}]], + [[{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}]]), + 5.0), + 3.5) + # zip ignores unmatched extras + test(sed_distance(([[{'a':[0.0,1.0]}, {'a':[1.0,1.5]}], + [{'a':[0.0,0.0]}, {'a':[0.0,0.5]}]], + [[{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}]]), + 5.0), + 3.5) + # sum of two identical things is twice the original + test(sed_distance(([[{'a':[0.0,1.0]}, {'a':[1.0,1.5]}], + [{'a':[0.0,1.0]}, {'a':[1.0,1.5]}]], + [[{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}], + [{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}]]), + 5.0), + 7.0) + # sum works otherwise (3.5 + 0.75 from sed_levenshtein) + test(sed_distance(([[{'a':[0.0,1.0]}, {'a':[1.0,1.5]}], + [{'a':[0.0,1.0]}, {'b':[1.0,1.5]}]], + [[{'a':[1.0,1.0], 'b':[0.0,1.5]}, {'b':[1.0,1.2]}], + [{'a':[1.0,1.0]}, {'b':[1.0,1.2]}]]), + 5.0), + 4.25) +def testAnalyse(self): + # is it correct to use different averages for each pair? + # it doesn't make any difference for the actual results so I guess + # it doesn't matter. 
+ # averages = map(sed_avg_total, lst.all_pairs(flatten(grouped_sed))), + # test(stdev(averages) < 1.0, True) + pass +def testMak(self): + test(mak('bool'), False) + test(mak('int'), 0) + test(mak('float'), 0.0) + test(mak('str'), '') + test(mak('[int]'), [0]) + test(mak('[bool]'), [False]) + test(mak('{str:int}'), {'':0}) + test(mak('(int,int,str)'), (0,0,'')) + self.assertRaises(ValueError, mak, '[int}') + self.assertRaises(ValueError, mak, '{int}') # ironically, of course, this + # is valid Python 3.0 syntax. (this is the real reason Guido wanted to + # get rid of lambda) + self.assertRaises(ValueError, mak, '[]') + self.assertRaises(ValueError, mak, '{}') + self.assertRaises(ValueError, mak, '{:}') + self.assertRaises(ValueError, mak, '()') + +tester.runTest(__import__(__name__), locals())