Added a few missing files, plus moved a bunch of old files out of the…

… repo directory. Massive additions to .gitignore
sandersn · Dec 10, 2009 · 729ee92 · 729ee92
1 parent 3067378
commit 729ee92
Show file tree

Hide file tree

Showing 10 changed files with 1,371 additions and 0 deletions.
diff --git a/ACL07_poster_equations.tex b/ACL07_poster_equations.tex
@@ -0,0 +1,25 @@
+\documentclass[11pt]{article}
+\usepackage[vcentering]{geometry} % dvips not needed for pdflatex
+% \usepackage[vcentering,dvips]{geometry}
+% % eqn 1
+% \geometry{papersize={20mm,15mm},total={30mm,24mm}}
+% % eqn 1
+% \geometry{papersize={50mm,18mm},total={65mm,26mm}}
+% % eqn 1
+% \geometry{papersize={40mm,18mm},total={52mm,26mm}}
+% eqn 1
+\geometry{papersize={68mm,18mm},total={85mm,26mm}}
+\begin{document}
+\thispagestyle{empty}
+% \[f = \frac{c}{N}\]
+
+% \[\forall j \in a,b : c'_{ji} = \frac{f_{ji}(c_{ai}+c_{bi})}{f_{ai}+f_{bi}}\]
+
+% \[\forall j \in a,b : s_{ji} = \frac{2nc_{ji}'}{N}\]
+
+\[ R = \Sigma_i |c_{ai} - \bar{c_i}| \textrm{ where } \bar{c_i} = \frac{c_{ai} + c_{bi}}{2}\]
+\end{document}
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: t
+%%% End: 
diff --git a/Lev.hs b/Lev.hs
@@ -0,0 +1,22 @@
+module Lev where
+import qualified Data.Map as Dct
+import Control.Monad (liftM)
+enum = zip [0..]
+takeFail 0 _ = Just []
+takeFail _ [] = Nothing
+takeFail n (x:xs) = liftM (x:) (takeFail (n - 1) xs)
+window n l = case takeFail n l of
+             Nothing -> []
+             Just xs -> xs : window n (tail l)
+-- _levenshtein :: (Enum b, Num b, Ord b) =>
+                -- [t] -> [a] -> b -> (a -> b, t -> b, a -> t -> b) -> [b]
+_levenshtein ss ts indel (ins,del,sub) =
+    let initial l = [0,indel..indel * fromIntegral (length l)]
+    in foldl (\ table (i,s) ->
+              (foldl (\ row (t,[prev,prev']) ->
+                      minimum [ins t + head row, del s + prev', sub s t + prev]
+                      : row)
+                     [i]
+-- doesn't work without double reverse. i suspect only (window 2) needs reverse
+                     (zip ts . window 2 {- . reverse-} $ table)))
+             (initial ts) {-(reverse (initial ts))-} (zip (tail (initial ss)) ss)
diff --git a/Main.hs b/Main.hs
@@ -0,0 +1,4 @@
+module Main where
+import qualified Sed
+main :: IO ()
+main = print . Sed.analyse =<< Sed.groupSedInGor
diff --git a/Sed.hs b/Sed.hs
@@ -0,0 +1,90 @@
+module Sed where
+import Text.Regex.Posix
+import Text.CSV
+import Char
+import qualified Data.Map as Dct
+import Data.List
+import Data.Ord
+import qualified Lev
+--- util ---
+(&) = flip (.)
+dctCollapse xs k v = Dct.fromAscListWith
+                       (++)
+                       (sortBy (comparing fst) [(k x, [v x]) | x <- xs])
+listExtract ns xs = extract ns xs 0
+    where extract [] _ _ = []
+          extract (n:ns) xs i =
+              xs !! (n - i) : extract ns (drop (n - i) xs) n
+kross (xs,ys) = do x <- xs; y <- ys; return (x,y)
+both f (x,y) = (f x, f y)
+pairs [] = []
+pairs (x:xs) = [(x,y) | y <- xs] ++ pairs xs
+average l = sum l / fromIntegral (length l)
+--- read CSV ---
+segment = head & dropWhile isLower & segmentName
+    where segmentName s = seg++n where (seg,n,_) =
+                                           s =~ "[0-9]" :: (String,String,String)
+features (title:ns) = (feature title, map read ns)
+    where feature s = feat where (_,_,feat) =
+                                     s =~ "[0-9]" :: (String,String,String)
+groupWords csv = Dct.map (fillsegments . phones) words
+    where words = dctCollapse (tail csv) (head & takeWhile isLower) id
+          fillsegments = Dct.mapWithKey makesegment
+          phones l = Dct.map Dct.fromList (dctCollapse l segment features)
+makesegment typ d =
+    let size = length . head . Dct.elems $ d
+    in d `Dct.union` Dct.map (replicate size) (featdict Dct.! init typ)
+groupRegions regions words = Dct.map outermost regions
+    where outermost range = Dct.map inner words
+              where inner = Dct.map (Dct.map (listExtract (map ((-) 2) range)))
+groupSedInGor = do
+  csv <- parseCSVFromFile "sed.csv"
+  case csv of
+    Left err -> error ("oh no:" ++ show err)
+    Right rows -> return $ groupRegions regions $ groupWords $ transpose rows
+--- analysis ---
+flatten = map (map Dct.elems . Dct.elems) . Dct.elems
+analyse sed = Dct.fromList . zip edges . map (sedDistance avgregions) $ regions
+    where edges = pairs (Dct.keys sed)
+          regions = pairs (flatten sed)
+          avgregions = average (map sedAvgTotal regions)
+featureSub seg1 seg2 = fromIntegral (Dct.size(seg1 `symmetric_difference` seg2))
+                       + sum (map abs (Dct.elems (Dct.intersectionWith (-) seg1 seg2)))
+  where symmetric_difference d e = Dct.union (e `Dct.difference` d)
+                                             (d `Dct.difference` e)
+sedDistance avg = sum . map (sedLevenshtein avg) . uncurry zip
+transposeWord word = transpose (map transposeSegment word)
+    where transposeSegment seg = map (Dct.fromList . zip (Dct.keys seg))
+                                     (transpose (Dct.elems seg))
+sedLevenshtein a = average . map (levenshtein a) . kross . both transposeWord
+levenshtein a (w1,w2) =
+    head $ Lev._levenshtein w1 w2 a (const a, const a, featureSub)
+sedAvg :: (Ord k, Fractional a) => ([Dct.Map k [a]], [Dct.Map k [a]]) -> a
+sedAvg = both (concat . transposeWord) & kross & map (uncurry featureSub) & average
+sedAvgTotal (region1,region2) = average (map sedAvg (zip region1 region2)) / 2.0
+--- data ---
+featdict = Dct.fromList [("C", Dct.fromList [("GL",0.0), ("V",0.0), ("H",0.0),
+                                             ("PV",0.0), ("L",0.0)]),
+                         ("V", Dct.fromList [("B",1.0), ("H",1.0),
+                                             ("L",1.0), ("R",1.0)]),
+                         ("R", Dct.fromList [("MN",1.5), ("PL",1.0)]),
+                         ("MULT", Dct.fromList [("MULT", 1.0)]),
+                         ("VC", Dct.empty)]
+regions :: Dct.Map String [Int]
+regions = Dct.fromList [("ne", [2..11]++[17..23]),
+                        ("nw", [11..17]++[23..41]++[77..83]),
+                        ("yk", [41..75]),
+                        -- ++[75..77] (Isle of Man isn't on GOR map)
+                        ("wm", [112..126]++[140..157]),
+                        ("em", [83..112]++[126..140]++[157..172]),
+                        ("ee", [172..191]++[217..238]),
+                        ("se", [206..217]++[262..279]++[302..315]),
+                        ("sw", [193..206]++[240..262]++[279..302]),
+                        ("ld", [238..240])]
+test = [["", "applV1H", "applV1L", "applC1GL", "applV2", "catcV1H", "askMULT0MULT", "askV1H", "askV1B"],
+        ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+        ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+        ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+        ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+        ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"],
+        ["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"]]
diff --git a/nord/RepairTalBanken.hs b/nord/RepairTalBanken.hs
@@ -0,0 +1,56 @@
+------- yet another uncrosser ------------
+example = Node ("S", 555)
+          [Node ("VP",510)
+           [Node ("NP",502)
+            [Leaf ("DET",0) ("Das",0), Leaf ("N",1) ("Buch",1), Leaf ("PRON",4) ("ihm",4)]
+           , Leaf ("V",2) ("gegeben",2)
+           , Leaf ("PART",5) ("habe",5)]
+          , Node ("NP",501) [Leaf ("PRON",3) ("ich",3)]]
+sexample = spanTree example
+spanTree :: Tree (String, Integer) -> Tree (Integer, Integer)
+spanTree (Leaf (_,i) _) = Leaf (i,i+1) (i,i+1)
+spanTree (Node _ kids) = Node (minimum starts, maximum ends) trees
+    where trees = map spanTree kids
+          starts = map (fst . dat) trees
+          ends = map (snd . dat) trees
+uncross' :: [Tree (Integer,Integer)] -> [Tree (Integer,Integer)]
+uncross' [] = []
+uncross' (Leaf a w : siblings) = Leaf a w : uncross' siblings
+uncross' (Node a kids : siblings) = uncross''.both depair.span continuous.pairs
+                                    $ kids
+    where uncross'' (co,[]) = co ++ uncross' siblings
+          uncross'' (co,disco) = co ++ uncross' (insert siblings disco)
+pairs l = zip l (tail l)
+both f (x,y) = (f x, f y)
+continuous (t, t') = snd (dat t) == fst (dat t')
+depair l = (fst $ head l) : map snd l
+insert = (++) -- insert has to stick disco in siblings somewhere and then uncross
+         -- it all. Not necessarily in that order.
+{-uncross (Node a kids) = Node a (uncross' kids)
+uncross l = l
+uncross' :: [Siblings] -> [Siblings] -- but uncrossed
+-- OK the problem is that insert might need to drop disco down a couple of levels into siblings
+-- in other words, the first step is the check what siblings disco belongs IN or AFTER
+-- then you may have to insert down, ie repeat the insert for the chosen sibling's kids
+-- ... told you there might be a lot of consing!
+insert siblings disco = let (before,actual:after) = splitBy ((lhs disco) >) siblings in
+    if rhs disco > lhs actual then -- or something like this
+       before ++ actual : disco ++ after -- um..you get the idea
+    else
+       before ++ (insert (kids actual) disco : after) -- whoo CONS! -}
+       -- also this recursive step should do some uncrossing of before and after, right?
+
+{- The idea is that you start at the leftmost kid of a Node.
+   You take as much as is continuous and you cons that onto the rest of the siblings+disco
+   after that has all been uncrossed.
+   co : uncross (insert disco siblings)
+   except that uncross has to take additional arguments?
+   also some uncrossings may have to burrow arbitrarily deep. I'm not sure of the limit yet.
+   Anyway you'd have to do it either way, bottom-up or top-down, so at least top-down it's
+   easier to maintain pointers to it all even if there is a lot of consing involved.
+
+   actually now that I sleep on it, I'm not sure about arbitrary deepness. I think that's
+   needed only if you want to try Wolfgang's bottom-up style. Adriane Boyd's split style
+   just attaches the disco part as a sibling to the co part I think.
+-}
+T
diff --git a/nord/TestDistance.hs b/nord/TestDistance.hs
@@ -0,0 +1,49 @@
+import Distance hiding (main)
+import Test.QuickCheck.Batch
+import Test.QuickCheck
+import Control.Arrow ((&&&))
+import Util
+import Data.List (group, sort, nub)
+import Data.List.Split (splitOn)
+import qualified Data.Map as Map
+{- instance Arbitrary Char where
+    arbitrary     = choose ('\32', '\128')
+    coarbitrary c = variant (ord c `rem` 4) -}
+
+exampleLines = splitOn "\n" "Morgoth\na\nb\n***\na\nc"
+
+prop_histogram_list :: [Int] -> Bool -- oops, this is a test for Util.
+prop_histogram_list l = Map.toList (histogram l) == listhist l
+  where listhist = sort & group & map (head &&& length)
+prop_histogram_empty :: [Int] -> Bool
+prop_histogram_empty l = (l == []) == (histogram l == Map.empty)
+
+prop_r_empty :: Bool
+prop_r_empty = r [] == 0.0
+prop_r_one = r [(1,2)] == 1.0
+prop_r_two = r [(1,2), (10,20)] == 11.0
+
+prop_cmp_truncate :: [Int] -> [Int] -> Property
+prop_cmp_truncate r1 r2 = (r2 /= [] && r1 /= []) ==>
+                          length (cmp r1 r2) == length (nub r2)
+prop_cmp_zeroes_r1 :: [Int] -> [Int] -> Property
+prop_cmp_zeroes_r1 r1 r2 = (r2 /= [] && r1 /= []) ==>
+  countBy (/=0.0) (map fst rcompare)
+    == Map.size (histogram r1 `Map.intersection` histogram r2)
+  where rcompare = cmp r1 r2
+prop_cmp_iterate :: [Int] -> [Int] -> Property
+prop_cmp_iterate r1 r2 = (r1 /= [] && r2 /= []) ==> all (\ ((a,b),(c,d)) -> abs (a - b) < abs (c - d)) $ zip (cmp r1 r2) $ map (\ (f,n) -> (fromIntegral $ Map.findWithDefault 0 f (histogram r1), fromIntegral n)) (Map.toList $ histogram r2)
+  where rcompare = cmp r1 r2
+main = runTests
+         "The Basics"
+         TestOptions {no_of_tests = 100
+                     ,length_of_tests = 1
+                     , debug_tests = False}
+         [run prop_histogram_list
+         , run prop_histogram_empty
+         , run prop_r_empty
+         , run prop_r_one
+         , run prop_r_two
+         , run prop_cmp_truncate
+         , run prop_cmp_zeroes_r1
+         , run prop_cmp_iterate]