Skip to content

Commit

Permalink
feat: Implement encoding in rules file
Browse files Browse the repository at this point in the history
Instead of `text-icu` as [recommended by `text`](https://hackage.haskell.org/package/text-2.1.2/docs/Data-Text-Encoding.html):

> To gain access to a much larger family of encodings, use the `text-icu` package.

we use `encoding`, since `text-icu` requires an external library.
`encoding` does require a custom setup, which is currently
affected by a [bug in Cabal](haskell/cabal#6505)
but it doesn't require an external runtime dependency, which makes it
easier to distribute on macos and windows.

I also changed the `rReadFn` to get a `Handle` rather than a `Text`.
Such that we can use non standard functions for reading it.
This is kind of sideways, since the one place we care about it,
we don't use it at all. Though it still caused issues, since the input
wasn't parseable as text.
I think not using the handle has something to do with how `-` is treated
as stdin but I didn't do any further tests on that, since this is out of
the scope of this PR.
  • Loading branch information
jokesper committed Feb 1, 2025
1 parent 0635fb9 commit 0512800
Show file tree
Hide file tree
Showing 16 changed files with 104 additions and 48 deletions.
26 changes: 16 additions & 10 deletions hledger-lib/Hledger/Read.hs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ module Hledger.Read (

-- * Easy journal parsing
readJournal',
readJournal'',
readJournalFile',
readJournalFiles',
orDieTrying,
Expand All @@ -125,7 +126,7 @@ module Hledger.Read (

--- ** imports
import qualified Control.Exception as C
import Control.Monad (unless, when, forM)
import Control.Monad (unless, when, forM, (<=<))
import "mtl" Control.Monad.Except (ExceptT(..), runExceptT, liftEither)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Default (def)
Expand All @@ -145,7 +146,7 @@ import System.Environment (getEnv)
import System.Exit (exitFailure)
import System.FilePath ((<.>), (</>), splitDirectories, splitFileName, takeFileName)
import System.Info (os)
import System.IO (hPutStr, stderr)
import System.IO (Handle, hPutStr, stderr)

import Hledger.Data.Dates (getCurrentDay, parsedateM, showDate)
import Hledger.Data.Types
Expand Down Expand Up @@ -205,7 +206,7 @@ type PrefixedFilePath = FilePath

-- | @readJournal iopts mfile txt@
--
-- Read a Journal from some text, with strict checks if enabled,
-- Read a Journal from some handle, with strict checks if enabled,
-- or return an error message.
--
-- The reader (data format) is chosen based on, in this order:
Expand All @@ -219,11 +220,11 @@ type PrefixedFilePath = FilePath
-- If none of these is available, or if the reader name is unrecognised,
-- we use the journal reader (for predictability).
--
readJournal :: InputOpts -> Maybe FilePath -> Text -> ExceptT String IO Journal
readJournal iopts@InputOpts{strict_, _defer} mpath txt = do
readJournal :: InputOpts -> Maybe FilePath -> Handle -> ExceptT String IO Journal
readJournal iopts@InputOpts{strict_, _defer} mpath hdl = do
let r :: Reader IO = fromMaybe JournalReader.reader $ findReader (mformat_ iopts) mpath
dbg6IO "readJournal: trying reader" (rFormat r)
j <- rReadFn r iopts (fromMaybe "(string)" mpath) txt
j <- rReadFn r iopts (fromMaybe "(string)" mpath) hdl
when (strict_ && not _defer) $ liftEither $ journalStrictChecks j
return j

Expand Down Expand Up @@ -264,11 +265,11 @@ readJournalFileAndLatestDates iopts prefixedfile = do
(mfmt, f) = splitReaderPrefix prefixedfile
iopts' = iopts{mformat_=asum [mfmt, mformat_ iopts]}
liftIO $ requireJournalFileExists f
t <-
h <-
traceOrLogAt 6 ("readJournalFile: "++takeFileName f) $
liftIO $ readFileOrStdinPortably f
liftIO $ openFileOrStdin f
-- <- T.readFile f -- or without line ending translation, for testing
j <- readJournal iopts' (Just f) t
j <- readJournal iopts' (Just f) h
if new_ iopts
then do
ds <- liftIO $ previousLatestDates f
Expand Down Expand Up @@ -313,9 +314,14 @@ readJournalFilesAndLatestDates iopts pfs = do

-- | An easy version of 'readJournal' which assumes default options, and fails
-- in the IO monad.
readJournal' :: Text -> IO Journal
readJournal' :: Handle -> IO Journal
readJournal' = orDieTrying . readJournal definputopts Nothing

-- | An even easier version of 'readJournal' which additionally to 'readJournal''
-- also takes a 'Text' instead of a 'Handle'.
readJournal'' :: Text -> IO Journal
readJournal'' = readJournal' <=< inputToHandle

-- | An easy version of 'readJournalFile' which assumes default options, and fails
-- in the IO monad.
readJournalFile' :: PrefixedFilePath -> IO Journal
Expand Down
10 changes: 8 additions & 2 deletions hledger-lib/Hledger/Read/Common.hs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ module Hledger.Read.Common (
HasInputOpts(..),
definputopts,
rawOptsToInputOpts,
handleReadFnToTextReadFn,

-- * parsing utilities
parseAndFinaliseJournal,
Expand Down Expand Up @@ -148,6 +149,7 @@ import Data.Time.Clock.POSIX (getPOSIXTime)
import Data.Time.LocalTime (LocalTime(..), TimeOfDay(..))
import Data.Word (Word8)
import System.FilePath (takeFileName)
import System.IO (Handle)
import Text.Megaparsec
import Text.Megaparsec.Char (char, char', digitChar, newline, string)
import Text.Megaparsec.Char.Lexer (decimal)
Expand Down Expand Up @@ -179,9 +181,9 @@ data Reader m = Reader {
,rExtensions :: [String]

-- The entry point for reading this format, accepting input options, file
-- path for error messages and file contents, producing an exception-raising IO
-- path for error messages and file contents via the handle, producing an exception-raising IO
-- action that produces a journal or error message.
,rReadFn :: InputOpts -> FilePath -> Text -> ExceptT String IO Journal
,rReadFn :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal

-- The actual megaparsec parser called by the above, in case
-- another parser (includedirectivep) wants to use it directly.
Expand Down Expand Up @@ -231,6 +233,10 @@ rawOptsToInputOpts day usecoloronstdout postingaccttags rawopts =
,_ioDay = day
}

handleReadFnToTextReadFn :: (InputOpts -> FilePath -> Text -> ExceptT String IO Journal) -> InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
handleReadFnToTextReadFn p iopts fp =
p iopts fp <=< lift . readHandlePortably

-- | Get the date span from --forecast's PERIODEXPR argument, if any.
-- This will fail with a usage error if the period expression cannot be parsed,
-- or if it contains a report interval.
Expand Down
10 changes: 5 additions & 5 deletions hledger-lib/Hledger/Read/CsvReader.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ where
import Prelude hiding (Applicative(..))
import Control.Monad.Except (ExceptT(..), liftEither)
import Control.Monad.IO.Class (MonadIO)
import Data.Text (Text)
import System.IO (Handle)

import Hledger.Data
import Hledger.Utils
Expand All @@ -54,18 +54,18 @@ reader sep = Reader
-- This file path is normally the CSV(/SSV/TSV) data file, and a corresponding rules file is inferred.
-- But it can also be the rules file, in which case the corresponding data file is inferred.
-- This does not check balance assertions.
parse :: SepFormat -> InputOpts -> FilePath -> Text -> ExceptT String IO Journal
parse sep iopts f t = do
parse :: SepFormat -> InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
parse sep iopts f h = do
let mrulesfile = mrules_file_ iopts
readJournalFromCsv (Right <$> mrulesfile) f t (Just sep)
readJournalFromCsv (Right <$> mrulesfile) f h (Just sep)
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are
-- reversed, as produced by JournalReader's parser.
-- But here they are already properly ordered. So we'd
-- better preemptively reverse them once more. XXX inefficient
. journalReverse
>>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f t
>>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f ""

--- ** tests

Expand Down
2 changes: 1 addition & 1 deletion hledger-lib/Hledger/Read/JournalReader.hs
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ reader :: MonadIO m => Reader m
reader = Reader
{rFormat = Journal'
,rExtensions = ["journal", "j", "hledger", "ledger"]
,rReadFn = parse
,rReadFn = handleReadFnToTextReadFn parse
,rParser = journalp -- no need to add command line aliases like journalp'
-- when called as a subparser I think
}
Expand Down
30 changes: 22 additions & 8 deletions hledger-lib/Hledger/Read/RulesReader.hs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ import Control.Monad.State.Strict (StateT, get, modify', evalStateT)
import Control.Monad.Trans.Class (lift)
import Data.Char (toLower, isDigit, isSpace, isAlphaNum, ord)
import Data.Bifunctor (first)
import Data.Encoding (encodingFromStringExplicit)
import Data.Functor ((<&>))
import Data.List (elemIndex, mapAccumL, nub, sortOn)
#if !MIN_VERSION_base(4,20,0)
Expand All @@ -69,6 +70,7 @@ import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
import Safe (atMay, headMay, lastMay, readMay)
import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName)
import System.IO (Handle, hClose)
import qualified Data.Csv as Cassava
import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
import qualified Data.ByteString as B
Expand Down Expand Up @@ -116,10 +118,11 @@ getDownloadDir = do
-- file's directory. When a glob pattern matches multiple files, the alphabetically
-- last is used. (Eg in case of multiple numbered downloads, the highest-numbered
-- will be used.)
-- The provided text, or a --rules option, are ignored by this reader.
-- The provided handle, or a --rules option, are ignored by this reader.
-- Balance assertions are not checked.
parse :: InputOpts -> FilePath -> Text -> ExceptT String IO Journal
parse iopts f _ = do
parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
parse iopts f h = do
lift $ hClose h -- We don't need it
rules <- readRulesFile $ dbg4 "reading rules file" f
-- XXX higher-than usual debug level for file reading to bypass excessive noise from elsewhere, normally 6 or 7
mdatafile <- liftIO $ do
Expand All @@ -139,8 +142,8 @@ parse iopts f _ = do
if not (dat=="-" || exists)
then return nulljournal -- data file inferred from rules file name was not found
else do
t <- liftIO $ readFileOrStdinPortably dat
readJournalFromCsv (Just $ Left rules) dat t Nothing
dath <- liftIO $ openFileOrStdin dat
readJournalFromCsv (Just $ Left rules) dat dath Nothing
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are
Expand Down Expand Up @@ -500,6 +503,7 @@ directivep = (do
directives :: [Text]
directives =
["source"
,"encoding"
,"date-format"
,"decimal-mark"
,"separator"
Expand Down Expand Up @@ -908,9 +912,9 @@ _CSV_READING__________________________________________ = undefined
--
-- 4. Return the transactions as a Journal.
--
readJournalFromCsv :: Maybe (Either CsvRules FilePath) -> FilePath -> Text -> Maybe SepFormat -> ExceptT String IO Journal
readJournalFromCsv Nothing "-" _ _ = throwError "please use --rules when reading CSV from stdin"
readJournalFromCsv merulesfile csvfile csvtext sep = do
readJournalFromCsv :: Maybe (Either CsvRules FilePath) -> FilePath -> Handle -> Maybe SepFormat -> ExceptT String IO Journal
readJournalFromCsv Nothing "-" h _ = lift (hClose h) *> throwError "please use --rules when reading CSV from stdin"
readJournalFromCsv merulesfile csvfile csvhandle sep = do
-- for now, correctness is the priority here, efficiency not so much

rules <- case merulesfile of
Expand All @@ -919,6 +923,16 @@ readJournalFromCsv merulesfile csvfile csvtext sep = do
Nothing -> readRulesFile $ rulesFileFor csvfile
dbg6IO "csv rules" rules

-- read csv while being aware of the encoding
mencoding <- do
-- XXX higher-than usual debug level for file reading to bypass excessive noise from elsewhere, normally 6 or 7
case T.unpack <$> getDirective "encoding" rules of
Just rawenc -> case encodingFromStringExplicit $ dbg4 "raw-encoding" rawenc of
Just enc -> return . Just $ dbg4 "encoding" enc
Nothing -> throwError $ "Invalid encoding: " <> rawenc
Nothing -> return Nothing
csvtext <- lift $ readHandlePortably' mencoding csvhandle

-- convert the csv data to lines and remove all empty/blank lines
let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext

Expand Down
2 changes: 1 addition & 1 deletion hledger-lib/Hledger/Read/TimeclockReader.hs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ reader :: MonadIO m => Reader m
reader = Reader
{rFormat = Timeclock
,rExtensions = ["timeclock"]
,rReadFn = parse
,rReadFn = handleReadFnToTextReadFn parse
,rParser = timeclockfilep
}

Expand Down
2 changes: 1 addition & 1 deletion hledger-lib/Hledger/Read/TimedotReader.hs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ reader :: MonadIO m => Reader m
reader = Reader
{rFormat = Timedot
,rExtensions = ["timedot"]
,rReadFn = parse
,rReadFn = handleReadFnToTextReadFn parse
,rParser = timedotp
}

Expand Down
6 changes: 3 additions & 3 deletions hledger-lib/Hledger/Reports/PostingsReport.hs
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ tests_PostingsReport = testGroup "PostingsReport" [
,"postings report with cleared option" ~:
do
let opts = defreportopts{cleared_=True}
j <- readJournal' sample_journal_str
j <- readJournal'' sample_journal_str
(postingsReportAsText opts $ postingsReport opts (queryFromOpts date1 opts) j) `is` unlines
["2008/06/03 eat & shop expenses:food $1 $1"
," expenses:supplies $1 $2"
Expand All @@ -313,7 +313,7 @@ tests_PostingsReport = testGroup "PostingsReport" [
,"postings report with uncleared option" ~:
do
let opts = defreportopts{uncleared_=True}
j <- readJournal' sample_journal_str
j <- readJournal'' sample_journal_str
(postingsReportAsText opts $ postingsReport opts (queryFromOpts date1 opts) j) `is` unlines
["2008/01/01 income assets:bank:checking $1 $1"
," income:salary $-1 0"
Expand All @@ -325,7 +325,7 @@ tests_PostingsReport = testGroup "PostingsReport" [
,"postings report sorts by date" ~:
do
j <- readJournal' $ unlines
j <- readJournal'' $ unlines
["2008/02/02 a"
," b 1"
," c"
Expand Down
42 changes: 35 additions & 7 deletions hledger-lib/Hledger/Utils/IO.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ pretty-printing haskell values, error reporting, time, files, command line parsi
terminals, pager output, ANSI colour/styles, etc.
-}

{-# LANGUAGE ImplicitParams #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE ScopedTypeVariables #-}

Expand All @@ -31,11 +33,15 @@ module Hledger.Utils.IO (
expandPath,
expandGlob,
sortByModTime,
openFileOrStdin,
readFileOrStdinPortably,
readFileOrStdinPortably',
readFileStrictly,
readFilePortably,
readHandlePortably,
readHandlePortably',
-- hereFileRelative,
inputToHandle,

-- * Command line parsing
progArgs,
Expand Down Expand Up @@ -111,6 +117,7 @@ import Data.Char (toLower)
import Data.Colour.RGBSpace (RGB(RGB))
import Data.Colour.RGBSpace.HSL (lightness)
import Data.Colour.SRGB (sRGB)
import Data.Encoding (DynEncoding)
import Data.FileEmbed (makeRelativeToProject, embedStringFile)
import Data.Functor ((<&>))
import Data.List hiding (uncons)
Expand All @@ -136,8 +143,9 @@ import System.FilePath (isRelative, (</>))
import "Glob" System.FilePath.Glob (glob)
import System.Info (os)
import System.IO (Handle, IOMode (..), hGetEncoding, hSetEncoding, hSetNewlineMode, openFile, stdin, stdout, stderr, universalNewlineMode, utf8_bom, hIsTerminalDevice, hPutStr, hClose)
import qualified System.IO.Encoding as Enc
import System.IO.Unsafe (unsafePerformIO)
import System.Process (CreateProcess(..), StdStream(CreatePipe), shell, waitForProcess, withCreateProcess)
import System.Process (CreateProcess(..), StdStream(CreatePipe), createPipe, shell, waitForProcess, withCreateProcess)
import Text.Pretty.Simple (CheckColorTty(..), OutputOptions(..), defaultOutputOptionsDarkBg, defaultOutputOptionsNoColor, pShowOpt, pPrintOpt)

import Hledger.Utils.Text (WideBuilder(WideBuilder))
Expand Down Expand Up @@ -280,19 +288,39 @@ readFilePortably f = openFile f ReadMode >>= readHandlePortably

-- | Like readFilePortably, but read from standard input if the path is "-".
readFileOrStdinPortably :: String -> IO T.Text
readFileOrStdinPortably f = openFileOrStdin f ReadMode >>= readHandlePortably
where
openFileOrStdin :: String -> IOMode -> IO Handle
openFileOrStdin "-" _ = return stdin
openFileOrStdin f' m = openFile f' m
readFileOrStdinPortably = readFileOrStdinPortably' Nothing

-- | Like readFileOrStdinPortably, but take an optional converter.
readFileOrStdinPortably' :: Maybe DynEncoding -> String -> IO T.Text
readFileOrStdinPortably' c f = openFileOrStdin f >>= readHandlePortably' c

openFileOrStdin :: String -> IO Handle
openFileOrStdin "-" = return stdin
openFileOrStdin f' = openFile f' ReadMode

readHandlePortably :: Handle -> IO T.Text
readHandlePortably h = do
readHandlePortably = readHandlePortably' Nothing

readHandlePortably' :: Maybe DynEncoding -> Handle -> IO T.Text
readHandlePortably' Nothing h = do
hSetNewlineMode h universalNewlineMode
menc <- hGetEncoding h
when (fmap show menc == Just "UTF-8") $ -- XXX no Eq instance, rely on Show
hSetEncoding h utf8_bom
T.hGetContents h
readHandlePortably' (Just e) h =
-- We need to manually apply the newline mode
-- Since we already have a Text
T.replace "\r\n" "\n" . T.pack <$> let ?enc = e in Enc.hGetContents h

inputToHandle :: T.Text -> IO Handle
inputToHandle t = do
(r, w) <- createPipe
hSetEncoding r utf8_bom
hSetEncoding w utf8_bom
T.hPutStr w t
hClose w
return r

-- | Like embedFile, but takes a path relative to the package directory.
embedFileRelative :: FilePath -> Q Exp
Expand Down
1 change: 1 addition & 0 deletions hledger-lib/hledger-lib.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ library
, deepseq
, directory >=1.2.6.1
, doclayout >=0.3 && <0.6
, encoding >=0.8.10
, extra >=1.6.3
, file-embed >=0.0.10
, filepath
Expand Down
1 change: 1 addition & 0 deletions hledger-lib/package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ dependencies:
- Decimal >=0.5.1
- directory >=1.2.6.1
- doclayout >=0.3 && <0.6
- encoding >=0.8.10
- file-embed >=0.0.10
- filepath
- hashtables >=1.2.3.1
Expand Down
Loading

0 comments on commit 0512800

Please sign in to comment.