byteverse · andrewthad · Jul 26, 2021 · Jul 23, 2021 · Jul 23, 2021 · Jul 23, 2021
diff --git a/byteslice.cabal b/byteslice.cabal
@@ -26,6 +26,11 @@ library
     Data.Bytes
     Data.Bytes.Chunks
     Data.Bytes.Mutable
+    Data.Bytes.Text.Ascii
+    Data.Bytes.Text.AsciiExt
+    Data.Bytes.Text.Latin1
+    Data.Bytes.Text.Utf8
+    Data.Bytes.Text.Windows1252
     Data.Bytes.Types
   other-modules:
     Data.Bytes.Byte

diff --git a/src/Data/Bytes.hs b/src/Data/Bytes.hs
diff --git a/src/Data/Bytes/Chunks.hs b/src/Data/Bytes/Chunks.hs
@@ -290,7 +290,7 @@ fnv1a64 !b = foldl'
   ) 0xcbf29ce484222325 b
 
 -- | Outputs 'Chunks' to the specified 'Handle'. This is implemented
--- with 'hPutBuf'.
+-- with 'IO.hPut'.
 hPut :: Handle -> Chunks -> IO ()
 hPut h = go where
   go ChunksNil = pure ()

diff --git a/src/Data/Bytes/IO.hs b/src/Data/Bytes/IO.hs
@@ -22,12 +22,12 @@ import qualified GHC.Exts as Exts
 import qualified Data.Primitive as PM
 
 -- | Read 'Bytes' directly from the specified 'Handle'. The resulting
--- 'Bytes' are pinned. This is implemented with 'hGetBuf'.
+-- 'Bytes' are pinned. This is implemented with 'IO.hGetBuf'.
 hGet :: Handle -> Int -> IO Bytes
 hGet h i = createPinnedAndTrim i (\p -> IO.hGetBuf h p i)
 
 -- | Outputs 'Bytes' to the specified 'Handle'. This is implemented
--- with 'hPutBuf'.
+-- with 'IO.hPutBuf'.
 hPut :: Handle -> Bytes -> IO ()
 hPut h b0 = do
   let b1@(Bytes arr _ len) = pin b0
@@ -52,4 +52,3 @@ touchMutableByteArrayIO (PM.MutableByteArray x) =
 touchByteArrayIO :: ByteArray -> IO ()
 touchByteArrayIO (ByteArray x) =
   IO (\s -> (# Exts.touch# x s, () #))
-
diff --git a/src/Data/Bytes/Mutable.hs b/src/Data/Bytes/Mutable.hs
@@ -1,6 +1,13 @@
 {-# language BangPatterns #-}
 {-# language LambdaCase #-}
 
+-- | If you are interested in sub-arrays of 'MutableByteArray's (e.g. writing
+-- quicksort), it would be grossly inefficient to make a copy of the sub-array.
+-- On the other hand, it'd be really annoying to track limit indices by hand.
+--
+-- This module defines the 'MutableBytes' type which exposes a standard array
+-- interface for sub-arrays without copying and without manual index
+-- manipulation. For immutable arrays, see "Data.Bytes".
 module Data.Bytes.Mutable
   ( -- * Types
     MutableBytes

diff --git a/src/Data/Bytes/Pure.hs b/src/Data/Bytes/Pure.hs
@@ -18,26 +18,35 @@ module Data.Bytes.Pure
   , toPinnedByteArrayClone
   , fromByteArray
   , length
+  , foldl
   , foldl'
+  , foldr
+  , ifoldl'
+  , foldr'
   , fnv1a32
   , fnv1a64
   , toByteString
   , pinnedToByteString
+  , fromByteString
   ) where
 
-import Prelude hiding (length)
+import Prelude hiding (length,foldl,foldr)
 
 import Control.Monad.Primitive (PrimState,PrimMonad)
 import Control.Monad.ST.Run (runByteArrayST)
 import Data.Bits (xor)
-import Data.ByteString (ByteString)
 import Data.Bytes.Types (Bytes(Bytes))
+import Data.ByteString (ByteString)
 import Data.Primitive (ByteArray,MutableByteArray)
 import Data.Word (Word64,Word32,Word8)
 import Foreign.Ptr (Ptr,plusPtr)
+import GHC.IO (unsafeIOToST)
 
+import qualified Data.ByteString as ByteString
 import qualified Data.ByteString.Internal as ByteString
+import qualified Data.ByteString.Unsafe as ByteString
 import qualified Data.Primitive as PM
+import qualified Data.Primitive.Ptr as PM
 import qualified GHC.Exts as Exts
 import qualified GHC.ForeignPtr as ForeignPtr
 
@@ -124,6 +133,16 @@ fnv1a64 !b = foldl'
   (\acc w -> (fromIntegral @Word8 @Word64 w `xor` acc) * 0x00000100000001B3
   ) 0xcbf29ce484222325 b
 
+-- | Lazy left fold over bytes; the accumulator is not forced.
+foldl :: (a -> Word8 -> a) -> a -> Bytes -> a
+{-# inline foldl #-}
+foldl f a0 (Bytes arr off0 len0) = go (off0 + len0 - 1) len0
+  where
+  -- Start at the last byte: the outermost @f@ consumes the final element.
+  go !ix !remaining
+    | remaining == 0 = a0
+    | otherwise = f (go (ix - 1) (remaining - 1)) (PM.indexByteArray arr ix)
+
 -- | Left fold over bytes, strict in the accumulator.
 foldl' :: (a -> Word8 -> a) -> a -> Bytes -> a
 {-# inline foldl' #-}
@@ -132,6 +151,34 @@ foldl' f a0 (Bytes arr off0 len0) = go a0 off0 len0 where
     0 -> a
     _ -> go (f a (PM.indexByteArray arr off)) (off + 1) (len - 1)
 
+-- | Lazy right fold over bytes; the accumulator is not forced.
+foldr :: (Word8 -> a -> a) -> a -> Bytes -> a
+{-# inline foldr #-}
+foldr f a0 (Bytes arr off0 len0) = go off0 len0 where
+  go !ix !remaining
+    | remaining == 0 = a0
+    | otherwise = f (PM.indexByteArray arr ix) (go (ix + 1) (remaining - 1))
+
+-- | Strict left fold over bytes that also passes the zero-based position of
+-- each byte within the sequence to the reduction function.
+ifoldl' :: (a -> Int -> Word8 -> a) -> a -> Bytes -> a
+{-# inline ifoldl' #-}
+ifoldl' f a0 (Bytes arr off0 len0) = go a0 0 where
+  go !acc !ix
+    | ix == len0 = acc
+    | otherwise = go (f acc ix (PM.indexByteArray arr (off0 + ix))) (ix + 1)
+
+-- | Strict right fold over bytes; the accumulator is forced at every step.
+foldr' :: (Word8 -> a -> a) -> a -> Bytes -> a
+{-# inline foldr' #-}
+foldr' f a0 (Bytes arr off0 len0) = go a0 (off0 + len0 - 1) len0
+  where
+  -- Consume bytes from the last one down to the first.
+  go !acc !ix !remaining
+    | remaining == 0 = acc
+    | otherwise = go (f (PM.indexByteArray arr ix) acc) (ix - 1) (remaining - 1)
+
+
 -- | Yields a pointer to the beginning of the byte sequence. It is only safe
 -- to call this on a 'Bytes' backed by a pinned @ByteArray@.
 contents :: Bytes -> Ptr Word8
@@ -162,7 +209,8 @@ toPinnedByteArrayClone (Bytes arr off len) = runByteArrayST $ do
 toByteString :: Bytes -> ByteString
 toByteString !b = pinnedToByteString (pin b)
 
--- | /O(1)/ Precondition: bytes are pinned. Behavior is undefined otherwise.
+-- | /O(1)/ Convert a pinned 'Bytes' to a 'ByteString'. Precondition: the
+-- bytes are pinned. Behavior is undefined otherwise.
 pinnedToByteString :: Bytes -> ByteString
 pinnedToByteString (Bytes y@(PM.ByteArray x) off len) =
   ByteString.PS
@@ -171,3 +219,15 @@ pinnedToByteString (Bytes y@(PM.ByteArray x) off len) =
       (ForeignPtr.PlainPtr (Exts.unsafeCoerce# x))
     )
     0 len
+
+-- | /O(n)/ Build a byte sequence by copying the contents of a 'ByteString'.
+fromByteString :: ByteString -> Bytes
+fromByteString !b = Bytes copied 0 len
+  where
+  !len = ByteString.length b
+  -- Allocate a fresh array, copy @len@ bytes out of the 'ByteString', freeze.
+  copied = runByteArrayST $ unsafeIOToST $ do
+    dst@(PM.MutableByteArray dst# ) <- PM.newByteArray len
+    ByteString.unsafeUseAsCString b $ \src ->
+      PM.copyPtrToMutablePrimArray (PM.MutablePrimArray dst# ) 0 src len
+    PM.unsafeFreezeByteArray dst
diff --git a/src/Data/Bytes/Text/Ascii.hs b/src/Data/Bytes/Text/Ascii.hs
@@ -0,0 +1,29 @@
+{-# LANGUAGE TypeApplications #-}
+
+-- | This module treats 'Bytes' data as holding ASCII text. Providing bytes
+-- outside the ASCII range (@U+0000@ -- @U+007F@) may cause a failure or
+-- unspecified results, but such bytes will never be inspected.
+--
+-- For functions that can operate on ASCII-compatible encodings, see
+-- "Data.Bytes.Text.AsciiExt".
+module Data.Bytes.Text.Ascii
+  ( fromString
+  ) where
+
+import Data.Bytes.Types (Bytes)
+import Data.Char (ord)
+import Data.Word (Word8)
+
+import qualified Data.Bytes.Pure as Bytes
+import qualified GHC.Exts as Exts
+
+
+-- | Encode a 'String' as ASCII bytes, one byte per character. Characters
+-- beyond the ASCII block (codepoints above @U+007F@) become @0x00@.
+fromString :: String -> Bytes
+fromString = Bytes.fromByteArray . Exts.fromList . map encode
+  where
+  encode c | ord c < 128 = fromIntegral @Int @Word8 (ord c)
+           | otherwise = 0
+
+-- TODO presumably also fromText and fromShortText
diff --git a/src/Data/Bytes/Text/AsciiExt.hs b/src/Data/Bytes/Text/AsciiExt.hs
@@ -0,0 +1,96 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE RankNTypes #-}
+
+-- | This module contains functions which operate on 'Bytes' holding text in any encoding that is a superset of ASCII.
+-- That is, none of the functions here inspect bytes with a value greater than 127, nor do they fail due to the presence of such bytes.
+--
+-- For functions that can fail for bytes outside the ASCII range, see
+-- "Data.Bytes.Text.Ascii". For functions that can inspect bytes outside ASCII, see
+-- any of the modules for ASCII-compatible encodings (e.g. "Data.Bytes.Text.Utf8",
+-- "Data.Bytes.Text.Latin1", and so on).
+module Data.Bytes.Text.AsciiExt
+  ( -- * Line-Oriented IO
+    hFoldLines
+  , hForLines_
+  -- ** Standard Handles
+  , forLines_
+  , foldLines
+  -- * Text Manipulation
+  , toLowerU
+  ) where
+
+import Control.Monad.ST (ST)
+import Control.Monad.ST.Run (runByteArrayST)
+import Data.Bytes.Types (Bytes(..))
+import Data.Primitive (ByteArray)
+import Data.Word (Word8)
+import System.IO (Handle, hIsEOF, stdin)
+
+import qualified Data.Bytes.Pure as Bytes
+import qualified Data.ByteString.Char8 as BC8
+import qualified Data.Primitive as PM
+
+-- | Specialization of 'hForLines_' that reads lines from 'stdin'.
+forLines_ :: (Bytes -> IO a) -> IO ()
+{-# INLINEABLE forLines_ #-}
+forLines_ body = hForLines_ stdin body
+
+-- | Specialization of 'hFoldLines' that reads lines from 'stdin'.
+foldLines :: a -> (a -> Bytes -> IO a) -> IO a
+{-# INLINEABLE foldLines #-}
+foldLines z body = hFoldLines stdin z body
+
+-- | Run an action on every line read from the handle, discarding the
+-- results. To carry state from one line to the next, see 'hFoldLines'.
+--
+-- Lines are extracted with 'BC8.hGetLine', which does not document its
+-- detection algorithm. As of writing (bytestring v0.11.1.0), lines are
+-- delimited by a single @\n@ character (UNIX-style, as all things should be).
+hForLines_ :: Handle -> (Bytes -> IO a) -> IO ()
+hForLines_ h body = go
+  where
+  go = do
+    atEnd <- hIsEOF h
+    if atEnd then pure () else step
+  step = do
+    _ <- body . Bytes.fromByteString =<< BC8.hGetLine h
+    go
+
+-- | Run an action on every line read from the handle, threading a state
+-- value through the calls. If no state is needed, see 'hForLines_'.
+--
+-- Lines are extracted with 'BC8.hGetLine', which does not document its
+-- detection algorithm. As of writing (bytestring v0.11.1.0), lines are
+-- delimited by a single @\n@ character (UNIX-style, as all things should be).
+hFoldLines :: Handle -> a -> (a -> Bytes -> IO a) -> IO a
+hFoldLines h z body = go z
+  where
+  go !acc = do
+    atEnd <- hIsEOF h
+    if atEnd then pure acc else step acc
+  step acc = do
+    acc' <- body acc . Bytes.fromByteString =<< BC8.hGetLine h
+    go acc'
+
+-- | /O(n)/ Lowercase the ASCII letters of a byte sequence: bytes in the range
+-- @[0x41,0x5A]@ (@A-Z@) have @0x20@ added (giving @a-z@); every other byte is
+-- kept as is. The result is always a freshly allocated copy.
+toLowerU :: Bytes -> ByteArray
+toLowerU (Bytes src off0 len0) = runByteArrayST $ do
+  dst <- PM.newByteArray len0
+  -- Copy byte by byte, lowercasing along the way; @srcIx@ walks the source
+  -- slice while @dstIx@ walks the zero-based destination.
+  let fill !srcIx !dstIx
+        | dstIx == len0 = pure ()
+        | otherwise = do
+            PM.writeByteArray dst dstIx (lower (PM.indexByteArray src srcIx))
+            fill (srcIx + 1) (dstIx + 1)
+  fill off0 0
+  PM.unsafeFreezeByteArray dst
+  where
+  -- Only bytes in the uppercase range @0x41@-@0x5A@ are shifted.
+  lower :: Word8 -> Word8
+  lower w
+    | w >= 0x41 && w <= 0x5A = w + 32
+    | otherwise = w