-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Delimited text parsing and improved parsing of consecutive separators
- Loading branch information
Showing
6 changed files
with
200 additions
and
143 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,151 +1,148 @@ | ||
module CSVParser exposing (parse) | ||
module CSVParser exposing (parse, parseDelimited) | ||
|
||
{-| Adapted from [Brian Hicks' example](https://gist.github.com/BrianHicks/165554b033eb797e3ed851964ecb3a38) | ||
{-| Adapted from <https://github.com/lovasoa/elm-csv> | ||
-} | ||
|
||
import Parser exposing ((|.), (|=), Parser) | ||
|
||
|
||
parse : String -> List (List String) | ||
parse input = | ||
case parseWithSeparators defaultSeparators input of | ||
Ok (CSV Plain items) -> | ||
items | ||
|> List.filter (not << List.isEmpty) | ||
|> List.map (List.map String.trim) | ||
parse = | ||
parseWith "," | ||
>> mergeWithHeaders | ||
|
||
|
||
_ -> | ||
[] | ||
{-| Parse `input` as delimiter-separated text, with the single character
`delimiter` standing between fields. The result is a list of rows, header row
first, where each row is a list of field strings.
-}
parseDelimited : Char -> String -> List (List String)
parseDelimited delimiter input =
    input
        |> parseWith (String.fromChar delimiter)
        |> mergeWithHeaders
|
||
|
||
|
||
----------------------------------------------------------------------- Private | ||
|
||
|
||
type alias Row = | ||
List String | ||
-- Parsed table: `parseWith` stores the first row in `headers` and all | ||
-- following rows in `records`. | ||
type alias Csv = | ||
{ headers : List String | ||
, records : List (List String) | ||
} | ||
|
||
|
||
{-| Flatten a `Csv` back into a plain list of rows, header row first.

A `Csv` with no header fields and no records is what `parseWith` produces for
input without any non-empty line. It yields `[]` here; consing unconditionally
would instead return `[ [] ]`, a single phantom empty row.
-}
mergeWithHeaders : Csv -> List (List String)
mergeWithHeaders csv =
    if List.isEmpty csv.headers && List.isEmpty csv.records then
        []

    else
        csv.headers :: csv.records
|
||
type Plain | ||
= Plain | ||
|
||
parseWith : String -> String -> Csv | ||
parseWith separator lines = | ||
let | ||
values = | ||
splitWith separator lines | ||
|
||
headers = | ||
List.head values | ||
|> Maybe.withDefault [] | ||
|
||
type WithNamedFields | ||
= WithNamedFields Row | ||
| EmptyHeaders | ||
records = | ||
List.drop 1 values | ||
in | ||
{ headers = headers | ||
, records = records | ||
} | ||
|
||
|
||
type CSV a | ||
= CSV a (List Row) | ||
{-| Split comma-separated text into rows of fields; this is `splitWith` with
the separator fixed to `","`.
-}
split : String -> List (List String)
split lines =
    splitWith "," lines
|
||
|
||
type alias Separators = | ||
{ value : Char } | ||
{-| Break `lines` into individual lines, discard the empty ones, and split each
remaining line into fields on `separator`.

Line splitting happens before any quote handling, so a line break inside a
quoted field still starts a new row.
-}
splitWith : String -> String -> List (List String)
splitWith separator lines =
    lines
        |> String.lines
        |> List.filter (not << String.isEmpty)
        |> List.map (splitLineWith separator)
|
||
|
||
defaultSeparators : Separators | ||
defaultSeparators = | ||
{ value = ',' } | ||
{-| Split a single comma-separated line into its fields; this is
`splitLineWith` with the separator fixed to `","`.
-}
splitLine : String -> List String
splitLine line =
    splitLineWith "," line
|
||
|
||
-- Runs the `rows` parser, built for the given separators, over the raw input. | ||
-- The result is whatever `Parser.run` returns: the parsed `CSV Plain` or a | ||
-- list of dead ends. | ||
parseWithSeparators : Separators -> String -> Result (List Parser.DeadEnd) (CSV Plain) | ||
parseWithSeparators separators raw = | ||
Parser.run (rows separators) raw | ||
{-| Split one line into its fields on `separator`; quote handling is done by
`parseRemaining`.

`parseRemaining` accumulates fields newest-first, so the result is reversed
back into input order. The accumulator is seeded with one empty field so that
a line starting with the separator keeps its leading empty field: `",b"` gives
`[ "", "b" ]` rather than `[ "b" ]`. An empty line still yields `[]`.
-}
splitLineWith : String -> String -> List String
splitLineWith separator line =
    if String.isEmpty line then
        []

    else
        parseRemaining separator False line [ "" ]
            |> List.reverse
|
||
|
||
-- Loops `rowsHelp` starting from an empty accumulator and wraps the resulting | ||
-- list of rows as `CSV Plain`. | ||
rows : Separators -> Parser (CSV Plain) | ||
rows separators = | ||
Parser.map (CSV Plain) (Parser.loop [] (rowsHelp separators)) | ||
parseRemaining : String -> Bool -> String -> List String -> List String | ||
parseRemaining separator quoted remaining done = | ||
if remaining == "" then | ||
done | ||
|
||
else if separator /= "" && not quoted && String.startsWith separator remaining then | ||
let | ||
newQuoted = | ||
False | ||
|
||
-- One step of the row loop; `revRows` holds the rows parsed so far, newest first. | ||
-- Alternatives are tried in order: at end of input the loop finishes with the | ||
-- rows reversed back into input order; otherwise one more row is parsed and | ||
-- prepended to the accumulator. | ||
rowsHelp : Separators -> List Row -> Parser (Parser.Step (List Row) (List Row)) | ||
rowsHelp separators revRows = | ||
Parser.oneOf | ||
[ Parser.end | ||
|> Parser.map (\_ -> Parser.Done (List.reverse revRows)) | ||
, row separators | ||
|> Parser.map (\newRow -> Parser.Loop (newRow :: revRows)) | ||
] | ||
nextChars = | ||
String.dropLeft (String.length separator) remaining | ||
in | ||
parseRemaining separator False nextChars ("" :: done) | ||
|
||
else | ||
let | ||
current = | ||
List.head done |> Maybe.withDefault "" | ||
|
||
-- Parses a single row by looping `rowHelp` from an empty list of values. | ||
row : Separators -> Parser Row | ||
row separators = | ||
Parser.loop [] (rowHelp separators) | ||
others = | ||
List.tail done |> Maybe.withDefault [] | ||
|
||
nextChar = | ||
String.slice 0 1 remaining | ||
|
||
rowHelp : Separators -> Row -> Parser (Parser.Step Row Row) | ||
rowHelp separators revVals = | ||
let | ||
doneWhen : Parser a -> Parser (Parser.Step Row Row) | ||
doneWhen = | ||
Parser.map (\_ -> Parser.Done (List.reverse revVals)) | ||
nextNextChar = | ||
String.slice 1 2 remaining | ||
|
||
nextWhen : Parser String -> Parser (Parser.Step Row Row) | ||
nextWhen = | ||
Parser.map (\newVal -> Parser.Loop (newVal :: revVals)) | ||
in | ||
Parser.oneOf | ||
[ doneWhen Parser.end | ||
, doneWhen (Parser.token "\n") | ||
, Parser.token (String.fromChar separators.value) |> skipTo revVals | ||
, nextWhen quotedValue | ||
|
||
-- TODO: token for \r\n after updating elm-format. It automatically | ||
-- formats to the wrong/old syntax for specifying codepoints in the | ||
-- version I have installed ATM | ||
, Parser.chompWhile (\c -> c /= '\n' && c /= separators.value) | ||
|> Parser.getChompedString | ||
|> nextWhen | ||
] | ||
|
||
|
||
-- Parses a double-quoted value: consumes the opening quote, then loops | ||
-- `quotedValueHelp` starting from the empty string. The loop yields a Result: | ||
-- `Ok` becomes the parsed value, `Err` is turned into a parser problem | ||
-- carrying that message. | ||
quotedValue : Parser String | ||
quotedValue = | ||
Parser.succeed identity | ||
|. Parser.token "\"" | ||
|= Parser.loop "" quotedValueHelp | ||
|> Parser.andThen | ||
(\final -> | ||
case final of | ||
Ok good -> | ||
Parser.succeed good | ||
|
||
Err err -> | ||
Parser.problem err | ||
) | ||
|
||
|
||
-- One step of the quoted-value loop; `soFar` is the text accumulated so far. | ||
-- The alternatives are tried in order: | ||
--   * end of input: finish with an `Err` message (the quote was never closed); | ||
--   * two double quotes, or a backslash followed by a double quote: append one | ||
--     double quote and continue; | ||
--   * a lone backslash: consume it without appending anything and continue; | ||
--   * a double quote: finish with `Ok soFar`; | ||
--   * otherwise: take everything up to the next backslash or double quote, | ||
--     append it and continue. | ||
quotedValueHelp : String -> Parser (Parser.Step String (Result String String)) | ||
quotedValueHelp soFar = | ||
let | ||
-- Discards the parser's own result and continues with `alt` appended to `soFar`. | ||
subAndLoop : String -> Parser a -> Parser (Parser.Step String b) | ||
subAndLoop alt parser = | ||
parser | ||
|> Parser.map (\_ -> Parser.Loop (soFar ++ alt)) | ||
in | ||
Parser.oneOf | ||
[ Parser.end |> Parser.map (\_ -> Parser.Done (Err "I reached the end of the input while trying to parse a quoted string.")) | ||
, Parser.token "\"\"" |> subAndLoop "\"" | ||
, Parser.token "\\\"" |> subAndLoop "\"" | ||
, Parser.token "\\" |> skipTo soFar | ||
, Parser.token "\"" | ||
|> Parser.map (\_ -> Parser.Done (Ok soFar)) | ||
, Parser.chompWhile (\c -> c /= '\\' && c /= '"') | ||
|> Parser.getChompedString | ||
|> Parser.map (\newPortion -> Parser.Loop (soFar ++ newPortion)) | ||
] | ||
|
||
|
||
-- Turns any parser into a loop step that discards the parsed result and | ||
-- continues the loop with `soFar` as the (unchanged) state. | ||
skipTo : b -> Parser a -> Parser (Parser.Step b c) | ||
skipTo soFar = | ||
Parser.map (\_ -> Parser.Loop soFar) | ||
|
||
|
||
-- Re-tags a plain CSV so that its first row becomes the field names: | ||
-- a non-empty row list yields `WithNamedFields` holding the first row, with | ||
-- the remaining rows as the body; an empty row list is tagged `EmptyHeaders`. | ||
firstRowAreNames : CSV Plain -> CSV WithNamedFields | ||
firstRowAreNames (CSV _ rowsAndHeader) = | ||
case rowsAndHeader of | ||
head :: body -> | ||
CSV (WithNamedFields head) body | ||
|
||
[] -> | ||
CSV EmptyHeaders rowsAndHeader | ||
startQuote = | ||
nextChar == "\"" && nextNextChar /= "\"" && current == "" | ||
|
||
doubleQuote = | ||
nextChar == "\"" && nextNextChar == "\"" | ||
|
||
isEscapedQuote = | ||
not quoted && (nextChar == "\\" || nextChar == "\"") && nextNextChar == "\"" | ||
|
||
endQuote = | ||
quoted && nextChar == "\"" && not isEscapedQuote | ||
|
||
newQuoted = | ||
(quoted && not endQuote) || startQuote | ||
|
||
nextChars = | ||
String.dropLeft | ||
(if isEscapedQuote || doubleQuote then | ||
2 | ||
|
||
else | ||
1 | ||
) | ||
remaining | ||
|
||
newChar = | ||
if doubleQuote then | ||
"" | ||
|
||
else if isEscapedQuote then | ||
"\"" | ||
|
||
else if startQuote || endQuote then | ||
"" | ||
|
||
else | ||
nextChar | ||
|
||
newDone = | ||
(current ++ newChar) :: others | ||
in | ||
parseRemaining separator newQuoted nextChars newDone |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.