Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Asciiext #15

Merged
merged 3 commits into from
Jul 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions byteslice.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ library
Data.Bytes
Data.Bytes.Chunks
Data.Bytes.Mutable
Data.Bytes.Text.Ascii
Data.Bytes.Text.AsciiExt
Data.Bytes.Text.Latin1
Data.Bytes.Text.Utf8
Data.Bytes.Text.Windows1252
Data.Bytes.Types
other-modules:
Data.Bytes.Byte
Expand Down
260 changes: 67 additions & 193 deletions src/Data/Bytes.hs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/Data/Bytes/Chunks.hs
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ fnv1a64 !b = foldl'
) 0xcbf29ce484222325 b

-- | Outputs 'Chunks' to the specified 'Handle'. This is implemented
-- with 'hPutBuf'.
-- with 'IO.hPut'.
hPut :: Handle -> Chunks -> IO ()
hPut h = go where
go ChunksNil = pure ()
Expand Down
5 changes: 2 additions & 3 deletions src/Data/Bytes/IO.hs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ import qualified GHC.Exts as Exts
import qualified Data.Primitive as PM

-- | Read 'Bytes' directly from the specified 'Handle'. The resulting
-- 'Bytes' are pinned. This is implemented with 'hGetBuf'.
-- 'Bytes' are pinned. This is implemented with 'IO.hGetBuf'.
hGet :: Handle -> Int -> IO Bytes
hGet handle n = createPinnedAndTrim n fill
  where
  -- Hands the buffer pointer and the requested size @n@ to 'IO.hGetBuf'.
  fill buf = IO.hGetBuf handle buf n

-- | Outputs 'Bytes' to the specified 'Handle'. This is implemented
-- with 'hPutBuf'.
-- with 'IO.hPutBuf'.
hPut :: Handle -> Bytes -> IO ()
hPut h b0 = do
let b1@(Bytes arr _ len) = pin b0
Expand All @@ -52,4 +52,3 @@ touchMutableByteArrayIO (PM.MutableByteArray x) =
touchByteArrayIO :: ByteArray -> IO ()
-- Applies the 'Exts.touch#' primop to the unwrapped array, threading the
-- state token through it and returning unit.
touchByteArrayIO (ByteArray arr) =
  IO (\s0 -> case Exts.touch# arr s0 of
    s1 -> (# s1, () #))

7 changes: 7 additions & 0 deletions src/Data/Bytes/Mutable.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
{-# language BangPatterns #-}
{-# language LambdaCase #-}

-- | If you are interested in sub-arrays of 'MutableByteArray's (e.g. writing
-- quicksort), it would be grossly inefficient to make a copy of the sub-array.
-- On the other hand, it'd be really annoying to track limit indices by hand.
--
-- This module defines the 'MutableBytes' type which exposes a standard array
-- interface for sub-arrays without copying and without manual index
-- manipulation. For immutable arrays, see "Data.Bytes".
module Data.Bytes.Mutable
( -- * Types
MutableBytes
Expand Down
66 changes: 63 additions & 3 deletions src/Data/Bytes/Pure.hs
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,35 @@ module Data.Bytes.Pure
, toPinnedByteArrayClone
, fromByteArray
, length
, foldl
, foldl'
, foldr
, ifoldl'
, foldr'
, fnv1a32
, fnv1a64
, toByteString
, pinnedToByteString
, fromByteString
) where

import Prelude hiding (length)
import Prelude hiding (length,foldl,foldr)

import Control.Monad.Primitive (PrimState,PrimMonad)
import Control.Monad.ST.Run (runByteArrayST)
import Data.Bits (xor)
import Data.ByteString (ByteString)
import Data.Bytes.Types (Bytes(Bytes))
import Data.ByteString (ByteString)
import Data.Primitive (ByteArray,MutableByteArray)
import Data.Word (Word64,Word32,Word8)
import Foreign.Ptr (Ptr,plusPtr)
import GHC.IO (unsafeIOToST)

import qualified Data.ByteString as ByteString
import qualified Data.ByteString.Internal as ByteString
import qualified Data.ByteString.Unsafe as ByteString
import qualified Data.Primitive as PM
import qualified Data.Primitive.Ptr as PM
import qualified GHC.Exts as Exts
import qualified GHC.ForeignPtr as ForeignPtr

Expand Down Expand Up @@ -124,6 +133,16 @@ fnv1a64 !b = foldl'
(\acc w -> (fromIntegral @Word8 @Word64 w `xor` acc) * 0x00000100000001B3
) 0xcbf29ce484222325 b

-- | Lazy left fold. The accumulator is not forced: the traversal starts at
-- the last byte of the slice and recurses towards the first, so the result
-- is @f (... (f a0 b0) ...) bLast@.
foldl :: (a -> Word8 -> a) -> a -> Bytes -> a
{-# inline foldl #-}
foldl step z (Bytes arr start n) = walk (start + n - 1) (n - 1)
  where
  -- @pos@ is the absolute index into @arr@; @k@ is the index relative to the
  -- slice and reaches -1 once every byte has been consumed.
  walk !pos !k
    | k == (-1) = z
    | otherwise = step (walk (pos - 1) (k - 1)) (PM.indexByteArray arr pos)

-- | Left fold over bytes, strict in the accumulator.
foldl' :: (a -> Word8 -> a) -> a -> Bytes -> a
{-# inline foldl' #-}
Expand All @@ -132,6 +151,34 @@ foldl' f a0 (Bytes arr off0 len0) = go a0 off0 len0 where
0 -> a
_ -> go (f a (PM.indexByteArray arr off)) (off + 1) (len - 1)

-- | Lazy right fold. The accumulator is not forced: bytes are visited from
-- the first to the last, and the recursive result is passed unevaluated as
-- the second argument of the reduction function.
foldr :: (Word8 -> a -> a) -> a -> Bytes -> a
{-# inline foldr #-}
foldr step z (Bytes arr start n) = walk start n
  where
  -- @pos@ is the absolute index into @arr@; @remaining@ counts the bytes
  -- still to be visited.
  walk !pos !remaining
    | remaining == 0 = z
    | otherwise = step (PM.indexByteArray arr pos) (walk (pos + 1) (remaining - 1))

-- | Indexed strict left fold. The reduction function receives the
-- accumulator, the zero-based position of the byte within the slice, and
-- the byte itself. The accumulator is forced at every step.
ifoldl' :: (a -> Int -> Word8 -> a) -> a -> Bytes -> a
{-# inline ifoldl' #-}
ifoldl' step z (Bytes arr start n) = walk z 0 start n
  where
  -- @i@ is the slice-relative index handed to @step@; @pos@ is the absolute
  -- index into @arr@; @remaining@ counts the bytes still to be visited.
  walk !acc !i !pos !remaining
    | remaining == 0 = acc
    | otherwise =
        walk (step acc i (PM.indexByteArray arr pos)) (i + 1) (pos + 1) (remaining - 1)

-- | Strict right fold. Bytes are consumed from the last to the first and
-- the accumulator is forced at every step.
foldr' :: (Word8 -> a -> a) -> a -> Bytes -> a
{-# inline foldr' #-}
foldr' step z (Bytes arr start n) = walk z (start + n - 1) (n - 1)
  where
  -- @pos@ is the absolute index into @arr@; @k@ is the index relative to the
  -- slice and reaches -1 once every byte has been consumed.
  walk !acc !pos !k
    | k == (-1) = acc
    | otherwise = walk (step (PM.indexByteArray arr pos) acc) (pos - 1) (k - 1)


-- | Yields a pointer to the beginning of the byte sequence. It is only safe
-- to call this on a 'Bytes' backed by a pinned @ByteArray@.
contents :: Bytes -> Ptr Word8
Expand Down Expand Up @@ -162,7 +209,8 @@ toPinnedByteArrayClone (Bytes arr off len) = runByteArrayST $ do
toByteString :: Bytes -> ByteString
-- The argument is forced, passed through 'pin', and the result is handed to
-- 'pinnedToByteString'.
toByteString !bytes = pinnedToByteString $ pin bytes

-- | /O(1)/ Precondition: bytes are pinned. Behavior is undefined otherwise.
-- | Convert a pinned 'Bytes' to a 'ByteString'
-- /O(1)/ Precondition: bytes are pinned. Behavior is undefined otherwise.
pinnedToByteString :: Bytes -> ByteString
pinnedToByteString (Bytes y@(PM.ByteArray x) off len) =
ByteString.PS
Expand All @@ -171,3 +219,15 @@ pinnedToByteString (Bytes y@(PM.ByteArray x) off len) =
(ForeignPtr.PlainPtr (Exts.unsafeCoerce# x))
)
0 len

-- | /O(n)/ Build a byte sequence by copying the contents of a 'ByteString'
-- into a freshly allocated byte array. The result has offset zero and the
-- same length as the input.
fromByteString :: ByteString -> Bytes
fromByteString !b = Bytes copied 0 len
  where
  !len = ByteString.length b
  -- Allocate @len@ bytes, copy from the ByteString's pointer into the new
  -- array, then freeze it. The copy runs in 'IO' and is lifted into 'ST'
  -- with 'unsafeIOToST'.
  copied = runByteArrayST $ unsafeIOToST $ do
    dst@(PM.MutableByteArray dst# ) <- PM.newByteArray len
    ByteString.unsafeUseAsCString b $ \src ->
      PM.copyPtrToMutablePrimArray (PM.MutablePrimArray dst# ) 0 src len
    PM.unsafeFreezeByteArray dst
29 changes: 29 additions & 0 deletions src/Data/Bytes/Text/Ascii.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{-# LANGUAGE TypeApplications #-}

-- | This module treats 'Bytes' data as holding ASCII text. Providing bytes
-- outside the ASCII range (@U+0000@ -- @U+007F@) may cause a failure or
-- unspecified results, but such bytes will never be inspected.
--
-- For functions that can operate on ASCII-compatible encodings, see
-- "Data.Bytes.Text.AsciiExt".
module Data.Bytes.Text.Ascii
( fromString
) where

import Data.Bytes.Types (Bytes)
import Data.Char (ord)
import Data.Word (Word8)

import qualified Data.Bytes.Pure as Bytes
import qualified GHC.Exts as Exts


-- | Build a byte sequence from a 'String' that is expected to contain only
-- ASCII characters. Each character with a codepoint below 128 becomes the
-- byte of that value; every other character becomes the byte @0x00@.
fromString :: String -> Bytes
fromString str = Bytes.fromByteArray (Exts.fromList (map toAscii str))
  where
  -- Map one character to its ASCII byte, or to zero when it is out of range.
  toAscii :: Char -> Word8
  toAscii c
    | code < 128 = fromIntegral @Int @Word8 code
    | otherwise = 0
    where
    code = ord c

-- TODO presumably also fromText and fromShortText
96 changes: 96 additions & 0 deletions src/Data/Bytes/Text/AsciiExt.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE RankNTypes #-}

-- | This module contains functions which operate on supersets of 'Bytes' containing ASCII-encoded text.
-- That is, none of the functions here inspect bytes with a value greater than 127, nor do they fail due to the presence of such bytes.
--
-- For functions that can fail for bytes outside the ASCII range, see
-- "Data.Bytes.Text.Ascii". For functions that can inspect bytes outside ASCII, see
-- any of the modules for ASCII-compatible encodings (e.g. "Data.Bytes.Text.Utf8",
-- "Data.Bytes.Text.Latin1", and so on).
module Data.Bytes.Text.AsciiExt
( -- * Line-Oriented IO
hFoldLines
, hForLines_
-- ** Standard Handles
, forLines_
, foldLines
-- * Text Manipulation
, toLowerU
) where

import Control.Monad.ST (ST)
import Control.Monad.ST.Run (runByteArrayST)
import Data.Bytes.Types (Bytes(..))
import Data.Primitive (ByteArray)
import Data.Word (Word8)
import System.IO (Handle, hIsEOF, stdin)

import qualified Data.Bytes.Pure as Bytes
import qualified Data.ByteString.Char8 as BC8
import qualified Data.Primitive as PM

-- | Run an action on each line read from 'stdin', discarding the results.
-- This is 'hForLines_' applied to 'stdin'.
forLines_ :: (Bytes -> IO a) -> IO ()
{-# INLINEABLE forLines_ #-}
forLines_ body = hForLines_ stdin body

-- | Fold an action over each line read from 'stdin', threading a state
-- value through. This is 'hFoldLines' applied to 'stdin'.
foldLines :: a -> (a -> Bytes -> IO a) -> IO a
{-# INLINEABLE foldLines #-}
foldLines z body = hFoldLines stdin z body

-- | Perform an action on each line of the input, discarding results.
-- To maintain a running state, see 'hFoldLines'.
--
-- Lines are extracted with 'BC8.hGetLine', which does not document its
-- detection algorithm. As of writing (bytestring v0.11.1.0), lines are
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dectection

-- delimited by a single @\n@ character (UNIX-style, as all things should be).
hForLines_ :: Handle -> (Bytes -> IO a) -> IO ()
hForLines_ h body = loop
  where
  -- Check for end of input before every read; stop once it is reached.
  loop = do
    eof <- hIsEOF h
    if eof
      then pure ()
      else do
        raw <- BC8.hGetLine h
        -- The result of the callback is deliberately discarded.
        _ <- body (Bytes.fromByteString raw)
        loop

-- | Perform an action on each line of the input, threading state through the computation.
-- If you do not need to keep a state, see `hForLines_`.
--
-- Lines are extracted with 'BC8.hGetLine', which does not document its
-- detection algorithm. As of writing (bytestring v0.11.1.0), lines are
-- delimited by a single @\n@ character (UNIX-style, as all things should be).
hFoldLines :: Handle -> a -> (a -> Bytes -> IO a) -> IO a
hFoldLines h z body = loop z
  where
  -- The state is forced on every iteration. End of input is checked before
  -- every read, and the current state is returned once it is reached.
  loop !acc = do
    eof <- hIsEOF h
    if eof
      then pure acc
      else do
        raw <- BC8.hGetLine h
        acc' <- body acc (Bytes.fromByteString raw)
        loop acc'

-- | /O(n)/ Lowercase the ASCII letters of a byte sequence. Bytes in the
-- range @[0x41,0x5A]@ (@A-Z@) have @0x20@ added to them (giving @a-z@);
-- every other byte is copied unchanged. A new array is always allocated,
-- even when no byte needs to change.
toLowerU :: Bytes -> ByteArray
toLowerU (Bytes src off0 len0) = runByteArrayST build
  where
  build :: forall s. ST s ByteArray
  build = do
    dst <- PM.newByteArray len0
    -- @srcIx@ walks the source starting at the slice offset, @dstIx@ walks
    -- the destination starting at zero, and @remaining@ counts the bytes
    -- still to be written.
    let copyLowered !srcIx !dstIx !remaining
          | remaining == 0 = pure ()
          | otherwise = do
              PM.writeByteArray dst dstIx (lowerAscii (PM.indexByteArray src srcIx))
              copyLowered (srcIx + 1) (dstIx + 1) (remaining - 1)
    copyLowered off0 0 len0
    PM.unsafeFreezeByteArray dst

  -- Map @A-Z@ to @a-z@; leave every other byte alone.
  lowerAscii :: Word8 -> Word8
  lowerAscii w
    | w >= 0x41 && w <= 0x5A = w + 32
    | otherwise = w
Loading