Skip to content

Commit

Permalink
Delimted text parsing and improved parsing of consecutive separators
Browse files Browse the repository at this point in the history
  • Loading branch information
jwoLondon committed Jan 22, 2019
1 parent dd88062 commit e68bf67
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 143 deletions.
2 changes: 1 addition & 1 deletion docs/api/docs.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/js/wickhamExamples.min.js

Large diffs are not rendered by default.

31 changes: 19 additions & 12 deletions examples/src/WickhamExamples.elm
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ view model =
]


table1TSV =
"""Treatment\tJohn Smith\tJane Doe\tMary Johnson
treatmenta\t\t16\t3
treatmentb\t2\t11\t1"""
|> Tidy.fromDelimited '\t'


messy1 : Table
messy1 =
"""Person,treatmenta,treatmentb
Expand Down Expand Up @@ -121,10 +128,10 @@ messy7 =
"""
year,artist,track,time, date.entered,wk1,wk2,wk3,wk4,wk5,wk6,wk7
2000,2 Pac,Baby Don't Cry,4:22, 2000-02-26,87,82,72,77,87,94,99
2000,2Ge+her,The Hardest Part Of...,3:15,2000-09-02,91,87,92,"","","",""
2000,2Ge+her,The Hardest Part Of...,3:15,2000-09-02,91,87,92,,,,
2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70,68,67,66,57,54
2000,98^0,Give Me Just One Night,3:24,2000-08-19,51,39,34,26,26,19,2
2000,A*Teens,Dancing Queen,3:44, 2000-07-08,97,97,96,95,100,"",""
2000,A*Teens,Dancing Queen,3:44, 2000-07-08,97,97,96,95,100,,
2000,Aaliyah,I Don't Wanna,4:15, 2000-01-29,84,62,51,41,38,35,35
2000,Aaliyah,Try Again,4:03, 2000-03-18,59,53,38,28,21,18,16
2000,"Adams, Yolanda",Open My Heart,5:30,2000-08-26,76,76,74,69,68,67,61
Expand Down Expand Up @@ -183,16 +190,16 @@ tidy8 =
messy9 : Table
messy9 =
"""country,year,m014,m1524,m2534,m3544,m4554,m5564,m65,mu,f014,f1524,f2534,f3544,f4554,f5564,f65,fu
AD,2000,0,0,1,0,0,0,0,"","","","","","","","",""
AE,2000,2,4,4,6,5,12,10,"",3,16,1,3,0,0,4,""
AF,2000,52,228,183,149,129,94,80,"",93,414,565,339,205,99,36,""
AG,2000,0,0,0,0,0,0,1,"",1,1,1,0,0,0,0,""
AL,2000,2,19,21,14,24,19,16,"",3,11,10,8,8,5,11,""
AM,2000,2,152,130,131,63,26,21,"",1,24,27,24,8,8,4,""
AN,2000,0,0,1,2,0,0,0,"",0,0,1,0,0,1,0,""
AO,2000,186,999,1003,912,482,312,194,"",247,1142,1091,844,417,200,120,""
AR,2000,97,278,594,402,419,368,330,"",121,544,479,262,230,179,216,""
AS,2000,"","","","",1,1,"","","","","","",1,"","",""
AD,2000,0,0,1,0,0,0,0,,,,,,,,,
AE,2000,2,4,4,6,5,12,10,,3,16,1,3,0,0,4,
AF,2000,52,228,183,149,129,94,80,,93,414,565,339,205,99,36,
AG,2000,0,0,0,0,0,0,1,,1,1,1,0,0,0,0,
AL,2000,2,19,21,14,24,19,16,,3,11,10,8,8,5,11,
AM,2000,2,152,130,131,63,26,21,,1,24,27,24,8,8,4,
AN,2000,0,0,1,2,0,0,0,,0,0,1,0,0,1,0,
AO,2000,186,999,1003,912,482,312,194,,247,1142,1091,844,417,200,120,
AR,2000,97,278,594,402,419,368,330,,121,544,479,262,230,179,216,
AS,2000,,,,,1,1,,,,,,,1,,,
""" |> fromCSV


Expand Down
233 changes: 115 additions & 118 deletions src/CSVParser.elm
Original file line number Diff line number Diff line change
@@ -1,151 +1,148 @@
module CSVParser exposing (parse)
module CSVParser exposing (parse, parseDelimited)

{-| Adapted from [Brian Hicks' example](https://gist.github.com/BrianHicks/165554b033eb797e3ed851964ecb3a38)
{-| Adapted from <https://github.com/lovasoa/elm-csv>
-}

import Parser exposing ((|.), (|=), Parser)


parse : String -> List (List String)
parse input =
case parseWithSeparators defaultSeparators input of
Ok (CSV Plain items) ->
items
|> List.filter (not << List.isEmpty)
|> List.map (List.map String.trim)
parse =
parseWith ","
>> mergeWithHeaders


_ ->
[]
parseDelimited : Char -> String -> List (List String)
parseDelimited delimiter =
parseWith (String.fromChar delimiter)
>> mergeWithHeaders



----------------------------------------------------------------------- Private


type alias Row =
List String
type alias Csv =
{ headers : List String
, records : List (List String)
}


mergeWithHeaders : Csv -> List (List String)
mergeWithHeaders csv =
.headers csv :: .records csv

type Plain
= Plain

parseWith : String -> String -> Csv
parseWith separator lines =
let
values =
splitWith separator lines

headers =
List.head values
|> Maybe.withDefault []

type WithNamedFields
= WithNamedFields Row
| EmptyHeaders
records =
List.drop 1 values
in
{ headers = headers
, records = records
}


type CSV a
= CSV a (List Row)
split : String -> List (List String)
split =
splitWith ","


type alias Separators =
{ value : Char }
splitWith : String -> String -> List (List String)
splitWith separator lines =
let
values =
String.lines lines
|> List.filter (\x -> not (String.isEmpty x))
in
List.map (splitLineWith separator) values


defaultSeparators : Separators
defaultSeparators =
{ value = ',' }
splitLine : String -> List String
splitLine =
splitLineWith ","


parseWithSeparators : Separators -> String -> Result (List Parser.DeadEnd) (CSV Plain)
parseWithSeparators separators raw =
Parser.run (rows separators) raw
splitLineWith : String -> String -> List String
splitLineWith separator line =
parseRemaining separator False line []
|> List.reverse


rows : Separators -> Parser (CSV Plain)
rows separators =
Parser.map (CSV Plain) (Parser.loop [] (rowsHelp separators))
parseRemaining : String -> Bool -> String -> List String -> List String
parseRemaining separator quoted remaining done =
if remaining == "" then
done

else if separator /= "" && not quoted && String.startsWith separator remaining then
let
newQuoted =
False

rowsHelp : Separators -> List Row -> Parser (Parser.Step (List Row) (List Row))
rowsHelp separators revRows =
Parser.oneOf
[ Parser.end
|> Parser.map (\_ -> Parser.Done (List.reverse revRows))
, row separators
|> Parser.map (\newRow -> Parser.Loop (newRow :: revRows))
]
nextChars =
String.dropLeft (String.length separator) remaining
in
parseRemaining separator False nextChars ("" :: done)

else
let
current =
List.head done |> Maybe.withDefault ""

row : Separators -> Parser Row
row separators =
Parser.loop [] (rowHelp separators)
others =
List.tail done |> Maybe.withDefault []

nextChar =
String.slice 0 1 remaining

rowHelp : Separators -> Row -> Parser (Parser.Step Row Row)
rowHelp separators revVals =
let
doneWhen : Parser a -> Parser (Parser.Step Row Row)
doneWhen =
Parser.map (\_ -> Parser.Done (List.reverse revVals))
nextNextChar =
String.slice 1 2 remaining

nextWhen : Parser String -> Parser (Parser.Step Row Row)
nextWhen =
Parser.map (\newVal -> Parser.Loop (newVal :: revVals))
in
Parser.oneOf
[ doneWhen Parser.end
, doneWhen (Parser.token "\n")
, Parser.token (String.fromChar separators.value) |> skipTo revVals
, nextWhen quotedValue

-- TODO: token for \r\n after updating elm-format. It automatically
-- formats to the wrong/old syntax for specifying codepoints in the
-- version I have installed ATM
, Parser.chompWhile (\c -> c /= '\n' && c /= separators.value)
|> Parser.getChompedString
|> nextWhen
]


quotedValue : Parser String
quotedValue =
Parser.succeed identity
|. Parser.token "\""
|= Parser.loop "" quotedValueHelp
|> Parser.andThen
(\final ->
case final of
Ok good ->
Parser.succeed good

Err err ->
Parser.problem err
)


quotedValueHelp : String -> Parser (Parser.Step String (Result String String))
quotedValueHelp soFar =
let
subAndLoop : String -> Parser a -> Parser (Parser.Step String b)
subAndLoop alt parser =
parser
|> Parser.map (\_ -> Parser.Loop (soFar ++ alt))
in
Parser.oneOf
[ Parser.end |> Parser.map (\_ -> Parser.Done (Err "I reached the end of the input while trying to parse a quoted string."))
, Parser.token "\"\"" |> subAndLoop "\""
, Parser.token "\\\"" |> subAndLoop "\""
, Parser.token "\\" |> skipTo soFar
, Parser.token "\""
|> Parser.map (\_ -> Parser.Done (Ok soFar))
, Parser.chompWhile (\c -> c /= '\\' && c /= '"')
|> Parser.getChompedString
|> Parser.map (\newPortion -> Parser.Loop (soFar ++ newPortion))
]


skipTo : b -> Parser a -> Parser (Parser.Step b c)
skipTo soFar =
Parser.map (\_ -> Parser.Loop soFar)


firstRowAreNames : CSV Plain -> CSV WithNamedFields
firstRowAreNames (CSV _ rowsAndHeader) =
case rowsAndHeader of
head :: body ->
CSV (WithNamedFields head) body

[] ->
CSV EmptyHeaders rowsAndHeader
startQuote =
nextChar == "\"" && nextNextChar /= "\"" && current == ""

doubleQuote =
nextChar == "\"" && nextNextChar == "\""

isEscapedQuote =
not quoted && (nextChar == "\\" || nextChar == "\"") && nextNextChar == "\""

endQuote =
quoted && nextChar == "\"" && not isEscapedQuote

newQuoted =
(quoted && not endQuote) || startQuote

nextChars =
String.dropLeft
(if isEscapedQuote || doubleQuote then
2

else
1
)
remaining

newChar =
if doubleQuote then
""

else if isEscapedQuote then
"\""

else if startQuote || endQuote then
""

else
nextChar

newDone =
(current ++ newChar) :: others
in
parseRemaining separator newQuoted nextChars newDone
34 changes: 27 additions & 7 deletions src/Tidy.elm
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module Tidy exposing
( Table
, fromCSV
, fromDelimited
, fromGrid
, fromGridRows
, empty
Expand Down Expand Up @@ -36,6 +37,7 @@ module Tidy exposing
# Create
@docs fromCSV
@docs fromDelimited
@docs fromGrid
@docs fromGridRows
@docs empty
Expand Down Expand Up @@ -150,17 +152,35 @@ empty =
toTable Dict.empty


{-| Create a table from a multi-line comma-separated string in the form:
{-| Create a table from a multi-line comma-separated string. For example
"""colLabelA,colLabelB,colLabelC,etc.
a1,b1,c1, etc.
a2,b2,c2, etc.
a3,b3,c3, etc.
etc."""
myTable =
"""colA,colB,colC
a1,b1,c1
a2,b2,c2
a3,b3,c3"""
|> fromCSV
-}
fromCSV : String -> Table
fromCSV =
fromDelimited ','


{-| Create a table from a multi-line string where values are separated by the
given delimiter (first parameter). For example, to process a tab-delimited values
file (TSV):
myTable =
"""colA colB colC
a1 b1 c1
a2 b2 c2
a3 b3 c3"""
|> fromDelimited '\t'
-}
fromDelimited : Char -> String -> Table
fromDelimited delimiter =
let
addEntry xs =
case xs of
Expand All @@ -170,7 +190,7 @@ fromCSV =
_ ->
identity
in
CSVParser.parse
CSVParser.parseDelimited delimiter
>> transpose
>> List.indexedMap Tuple.pair
>> List.foldl addEntry Dict.empty
Expand Down
Loading

0 comments on commit e68bf67

Please sign in to comment.