-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a few missing files, plus moved a bunch of old files out of the…
… repo directory. Massive additions to .gitignore
- Loading branch information
Showing
10 changed files
with
1,371 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
\documentclass[11pt]{article} | ||
\usepackage[vcentering]{geometry} % dvips not needed for pdflatex | ||
% \usepackage[vcentering,dvips]{geometry} | ||
% % eqn 1 | ||
% \geometry{papersize={20mm,15mm},total={30mm,24mm}} | ||
% % eqn 1 | ||
% \geometry{papersize={50mm,18mm},total={65mm,26mm}} | ||
% % eqn 1 | ||
% \geometry{papersize={40mm,18mm},total={52mm,26mm}} | ||
% eqn 1 | ||
\geometry{papersize={68mm,18mm},total={85mm,26mm}} | ||
\begin{document} | ||
\thispagestyle{empty} | ||
% \[f = \frac{c}{N}\] | ||
|
||
% \[\forall j \in a,b : c'_{ji} = \frac{f_{ji}(c_{ai}+c_{bi})}{f_{ai}+f_{bi}}\] | ||
|
||
% \[\forall j \in a,b : s_{ji} = \frac{2nc_{ji}'}{N}\] | ||
|
||
\[ R = \Sigma_i |c_{ai} - \bar{c_i}| \textrm{ where } \bar{c_i} = \frac{c_{ai} + c_{bi}}{2}\] | ||
\end{document} | ||
%%% Local Variables: | ||
%%% mode: latex | ||
%%% TeX-master: t | ||
%%% End: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
module Lev where | ||
import qualified Data.Map as Dct | ||
import Control.Monad (liftM) | ||
enum = zip [0..] | ||
takeFail 0 _ = Just [] | ||
takeFail _ [] = Nothing | ||
takeFail n (x:xs) = liftM (x:) (takeFail (n - 1) xs) | ||
window n l = case takeFail n l of | ||
Nothing -> [] | ||
Just xs -> xs : window n (tail l) | ||
-- _levenshtein :: (Enum b, Num b, Ord b) => | ||
-- [t] -> [a] -> b -> (a -> b, t -> b, a -> t -> b) -> [b] | ||
_levenshtein ss ts indel (ins,del,sub) = | ||
let initial l = [0,indel..indel * fromIntegral (length l)] | ||
in foldl (\ table (i,s) -> | ||
(foldl (\ row (t,[prev,prev']) -> | ||
minimum [ins t + head row, del s + prev', sub s t + prev] | ||
: row) | ||
[i] | ||
-- doesn't work without double reverse. i suspect only (window 2) needs reverse | ||
(zip ts . window 2 {- . reverse-} $ table))) | ||
(initial ts) {-(reverse (initial ts))-} (zip (tail (initial ss)) ss) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
module Main where | ||
import qualified Sed | ||
main :: IO () | ||
main = print . Sed.analyse =<< Sed.groupSedInGor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
module Sed where | ||
import Text.Regex.Posix | ||
import Text.CSV | ||
import Char | ||
import qualified Data.Map as Dct | ||
import Data.List | ||
import Data.Ord | ||
import qualified Lev | ||
--- util --- | ||
(&) = flip (.) | ||
dctCollapse xs k v = Dct.fromAscListWith | ||
(++) | ||
(sortBy (comparing fst) [(k x, [v x]) | x <- xs]) | ||
listExtract ns xs = extract ns xs 0 | ||
where extract [] _ _ = [] | ||
extract (n:ns) xs i = | ||
xs !! (n - i) : extract ns (drop (n - i) xs) n | ||
kross (xs,ys) = do x <- xs; y <- ys; return (x,y) | ||
both f (x,y) = (f x, f y) | ||
pairs [] = [] | ||
pairs (x:xs) = [(x,y) | y <- xs] ++ pairs xs | ||
average l = sum l / fromIntegral (length l) | ||
--- read CSV --- | ||
segment = head & dropWhile isLower & segmentName | ||
where segmentName s = seg++n where (seg,n,_) = | ||
s =~ "[0-9]" :: (String,String,String) | ||
features (title:ns) = (feature title, map read ns) | ||
where feature s = feat where (_,_,feat) = | ||
s =~ "[0-9]" :: (String,String,String) | ||
groupWords csv = Dct.map (fillsegments . phones) words | ||
where words = dctCollapse (tail csv) (head & takeWhile isLower) id | ||
fillsegments = Dct.mapWithKey makesegment | ||
phones l = Dct.map Dct.fromList (dctCollapse l segment features) | ||
makesegment typ d = | ||
let size = length . head . Dct.elems $ d | ||
in d `Dct.union` Dct.map (replicate size) (featdict Dct.! init typ) | ||
groupRegions regions words = Dct.map outermost regions | ||
where outermost range = Dct.map inner words | ||
where inner = Dct.map (Dct.map (listExtract (map ((-) 2) range))) | ||
groupSedInGor = do | ||
csv <- parseCSVFromFile "sed.csv" | ||
case csv of | ||
Left err -> error ("oh no:" ++ show err) | ||
Right rows -> return $ groupRegions regions $ groupWords $ transpose rows | ||
--- analysis --- | ||
flatten = map (map Dct.elems . Dct.elems) . Dct.elems | ||
analyse sed = Dct.fromList . zip edges . map (sedDistance avgregions) $ regions | ||
where edges = pairs (Dct.keys sed) | ||
regions = pairs (flatten sed) | ||
avgregions = average (map sedAvgTotal regions) | ||
featureSub seg1 seg2 = fromIntegral (Dct.size(seg1 `symmetric_difference` seg2)) | ||
+ sum (map abs (Dct.elems (Dct.intersectionWith (-) seg1 seg2))) | ||
where symmetric_difference d e = Dct.union (e `Dct.difference` d) | ||
(d `Dct.difference` e) | ||
sedDistance avg = sum . map (sedLevenshtein avg) . uncurry zip | ||
transposeWord word = transpose (map transposeSegment word) | ||
where transposeSegment seg = map (Dct.fromList . zip (Dct.keys seg)) | ||
(transpose (Dct.elems seg)) | ||
sedLevenshtein a = average . map (levenshtein a) . kross . both transposeWord | ||
levenshtein a (w1,w2) = | ||
head $ Lev._levenshtein w1 w2 a (const a, const a, featureSub) | ||
sedAvg :: (Ord k, Fractional a) => ([Dct.Map k [a]], [Dct.Map k [a]]) -> a | ||
sedAvg = both (concat . transposeWord) & kross & map (uncurry featureSub) & average | ||
sedAvgTotal (region1,region2) = average (map sedAvg (zip region1 region2)) / 2.0 | ||
--- data --- | ||
featdict = Dct.fromList [("C", Dct.fromList [("GL",0.0), ("V",0.0), ("H",0.0), | ||
("PV",0.0), ("L",0.0)]), | ||
("V", Dct.fromList [("B",1.0), ("H",1.0), | ||
("L",1.0), ("R",1.0)]), | ||
("R", Dct.fromList [("MN",1.5), ("PL",1.0)]), | ||
("MULT", Dct.fromList [("MULT", 1.0)]), | ||
("VC", Dct.empty)] | ||
regions :: Dct.Map String [Int] | ||
regions = Dct.fromList [("ne", [2..11]++[17..23]), | ||
("nw", [11..17]++[23..41]++[77..83]), | ||
("yk", [41..75]), | ||
-- ++[75..77] (Isle of Man isn't on GOR map) | ||
("wm", [112..126]++[140..157]), | ||
("em", [83..112]++[126..140]++[157..172]), | ||
("ee", [172..191]++[217..238]), | ||
("se", [206..217]++[262..279]++[302..315]), | ||
("sw", [193..206]++[240..262]++[279..302]), | ||
("ld", [238..240])] | ||
test = [["", "applV1H", "applV1L", "applC1GL", "applV2", "catcV1H", "askMULT0MULT", "askV1H", "askV1B"], | ||
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], | ||
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], | ||
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], | ||
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], | ||
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"], | ||
["", "1.0", "2.0", "3.0", "0.0", "3.0", "1.0", "2.0", "2.0"]] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
------- yet another uncrosser ------------ | ||
example = Node ("S", 555) | ||
[Node ("VP",510) | ||
[Node ("NP",502) | ||
[Leaf ("DET",0) ("Das",0), Leaf ("N",1) ("Buch",1), Leaf ("PRON",4) ("ihm",4)] | ||
, Leaf ("V",2) ("gegeben",2) | ||
, Leaf ("PART",5) ("habe",5)] | ||
, Node ("NP",501) [Leaf ("PRON",3) ("ich",3)]] | ||
sexample = spanTree example | ||
spanTree :: Tree (String, Integer) -> Tree (Integer, Integer) | ||
spanTree (Leaf (_,i) _) = Leaf (i,i+1) (i,i+1) | ||
spanTree (Node _ kids) = Node (minimum starts, maximum ends) trees | ||
where trees = map spanTree kids | ||
starts = map (fst . dat) trees | ||
ends = map (snd . dat) trees | ||
uncross' :: [Tree (Integer,Integer)] -> [Tree (Integer,Integer)] | ||
uncross' [] = [] | ||
uncross' (Leaf a w : siblings) = Leaf a w : uncross' siblings | ||
uncross' (Node a kids : siblings) = uncross''.both depair.span continuous.pairs | ||
$ kids | ||
where uncross'' (co,[]) = co ++ uncross' siblings | ||
uncross'' (co,disco) = co ++ uncross' (insert siblings disco) | ||
pairs l = zip l (tail l) | ||
both f (x,y) = (f x, f y) | ||
continuous (t, t') = snd (dat t) == fst (dat t') | ||
depair l = (fst $ head l) : map snd l | ||
insert = (++) -- insert has to stick disco in siblings somewhere and then uncross | ||
-- it all. Not necessarily in that order. | ||
{-uncross (Node a kids) = Node a (uncross' kids) | ||
uncross l = l | ||
uncross' :: [Siblings] -> [Siblings] -- but uncrossed | ||
-- OK the problem is that insert might need to drop disco down a couple of levels into siblings | ||
-- in other words, the first step is the check what siblings disco belongs IN or AFTER | ||
-- then you may have to insert down, ie repeat the insert for the chosen sibling's kids | ||
-- ... told you there might be a lot of consing! | ||
insert siblings disco = let (before,actual:after) = splitBy ((lhs disco) >) siblings in | ||
if rhs disco > lhs actual then -- or something like this | ||
before ++ actual : disco ++ after -- um..you get the idea | ||
else | ||
before ++ (insert (kids actual) disco : after) -- whoo CONS! -} | ||
-- also this recursive step should do some uncrossing of before and after, right? | ||
|
||
{- The idea is that you start at the leftmost kid of a Node. | ||
You take as much as is continuous and you cons that onto the rest of the siblings+disco | ||
after that has all been uncrossed. | ||
co : uncross (insert disco siblings) | ||
except that uncross has to take additional arguments? | ||
also some uncrossings may have to burrow arbitrarily deep. I'm not sure of the limit yet. | ||
Anyway you'd have to do it either way, bottom-up or top-down, so at least top-down it's | ||
easier to maintain pointers to it all even if there is a lot of consing involved. | ||
actually now that I sleep on it, I'm not sure about arbitrary deepness. I think that's | ||
needed only if you want to try Wolfgang's bottom-up style. Adriane Boyd's split style | ||
just attaches the disco part as a sibling to the co part I think. | ||
-} | ||
T |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import Distance hiding (main) | ||
import Test.QuickCheck.Batch | ||
import Test.QuickCheck | ||
import Control.Arrow ((&&&)) | ||
import Util | ||
import Data.List (group, sort, nub) | ||
import Data.List.Split (splitOn) | ||
import qualified Data.Map as Map | ||
{- instance Arbitrary Char where | ||
arbitrary = choose ('\32', '\128') | ||
coarbitrary c = variant (ord c `rem` 4) -} | ||
|
||
exampleLines = splitOn "\n" "Morgoth\na\nb\n***\na\nc" | ||
|
||
prop_histogram_list :: [Int] -> Bool -- oops, this is a test for Util. | ||
prop_histogram_list l = Map.toList (histogram l) == listhist l | ||
where listhist = sort & group & map (head &&& length) | ||
prop_histogram_empty :: [Int] -> Bool | ||
prop_histogram_empty l = (l == []) == (histogram l == Map.empty) | ||
|
||
prop_r_empty :: Bool | ||
prop_r_empty = r [] == 0.0 | ||
prop_r_one = r [(1,2)] == 1.0 | ||
prop_r_two = r [(1,2), (10,20)] == 11.0 | ||
|
||
prop_cmp_truncate :: [Int] -> [Int] -> Property | ||
prop_cmp_truncate r1 r2 = (r2 /= [] && r1 /= []) ==> | ||
length (cmp r1 r2) == length (nub r2) | ||
prop_cmp_zeroes_r1 :: [Int] -> [Int] -> Property | ||
prop_cmp_zeroes_r1 r1 r2 = (r2 /= [] && r1 /= []) ==> | ||
countBy (/=0.0) (map fst rcompare) | ||
== Map.size (histogram r1 `Map.intersection` histogram r2) | ||
where rcompare = cmp r1 r2 | ||
prop_cmp_iterate :: [Int] -> [Int] -> Property | ||
prop_cmp_iterate r1 r2 = (r1 /= [] && r2 /= []) ==> all (\ ((a,b),(c,d)) -> abs (a - b) < abs (c - d)) $ zip (cmp r1 r2) $ map (\ (f,n) -> (fromIntegral $ Map.findWithDefault 0 f (histogram r1), fromIntegral n)) (Map.toList $ histogram r2) | ||
where rcompare = cmp r1 r2 | ||
main = runTests | ||
"The Basics" | ||
TestOptions {no_of_tests = 100 | ||
,length_of_tests = 1 | ||
, debug_tests = False} | ||
[run prop_histogram_list | ||
, run prop_histogram_empty | ||
, run prop_r_empty | ||
, run prop_r_one | ||
, run prop_r_two | ||
, run prop_cmp_truncate | ||
, run prop_cmp_zeroes_r1 | ||
, run prop_cmp_iterate] |
Oops, something went wrong.