diff --git a/byteslice.cabal b/byteslice.cabal index 62af985..85be68d 100644 --- a/byteslice.cabal +++ b/byteslice.cabal @@ -26,6 +26,11 @@ library Data.Bytes Data.Bytes.Chunks Data.Bytes.Mutable + Data.Bytes.Text.Ascii + Data.Bytes.Text.AsciiExt + Data.Bytes.Text.Latin1 + Data.Bytes.Text.Utf8 + Data.Bytes.Text.Windows1252 Data.Bytes.Types other-modules: Data.Bytes.Byte diff --git a/src/Data/Bytes.hs b/src/Data/Bytes.hs index ac1ef41..5affdb7 100644 --- a/src/Data/Bytes.hs +++ b/src/Data/Bytes.hs @@ -8,6 +8,13 @@ {-# language TypeApplications #-} {-# language UnboxedTuples #-} +-- | If you are interested in sub-arrays of 'ByteArray's (e.g. writing a binary +-- search), it would be grossly inefficient to make a copy of the sub-array. On +-- the other hand, it'd be really annoying to track limit indices by hand. +-- +-- This module defines the 'Bytes' type which exposes a standard array interface +-- for a sub-arrays without copying and without manual index manipulation. -- +-- For mutable arrays, see 'Data.Bytes.Mutable'. module Data.Bytes ( -- * Types Bytes @@ -41,12 +48,12 @@ module Data.Bytes , takeWhileEnd , dropWhileEnd -- * Folds - , foldl + , Pure.foldl , Pure.foldl' - , foldr - , foldr' + , Pure.foldr + , Pure.foldr' -- * Folds with Indices - , ifoldl' + , Pure.ifoldl' -- * Common Folds , elem -- * Splitting @@ -125,7 +132,7 @@ module Data.Bytes , fromCString# , Pure.toByteString , Pure.pinnedToByteString - , fromByteString + , Pure.fromByteString , fromShortByteString , toShortByteString , toShortByteStringClone @@ -139,30 +146,27 @@ module Data.Bytes import Prelude hiding (length,takeWhile,dropWhile,null,foldl,foldr,elem,replicate,any,all,readFile) import Control.Monad.Primitive (PrimMonad,primitive_,unsafeIOToPrim) -import Control.Monad.ST (ST) import Control.Monad.ST.Run (runByteArrayST) import Cstrlen (cstringLength#) import Data.Bits((.&.),(.|.),shiftL,finiteBitSize) -import Data.Bytes.Pure (length,fromByteArray) +import Data.Bytes.Pure (length,fromByteArray,foldr) import Data.Bytes.Types (Bytes(Bytes,array,offset)) -import Data.ByteString (ByteString) import Data.ByteString.Short.Internal (ShortByteString(SBS)) -import Data.Char (ord) import Data.Maybe (fromMaybe) import Data.Primitive (ByteArray(ByteArray)) import Foreign.C.String (CString) import Foreign.Ptr (Ptr,plusPtr,castPtr) import GHC.Exts (Addr#,Word#,Int#) -import GHC.Exts (Int(I#),Char(C#),Ptr(Ptr),word2Int#,chr#) -import GHC.IO (unsafeIOToST) +import GHC.Exts (Int(I#),Ptr(Ptr)) import GHC.Word (Word8(W8#),Word32) -import qualified Data.ByteString as ByteString -import qualified Data.ByteString.Unsafe as ByteString import qualified Data.Bytes.Byte as Byte import qualified Data.Bytes.Chunks as Chunks import qualified Data.Bytes.IO as BIO import qualified Data.Bytes.Pure as Pure +import qualified Data.Bytes.Text.Ascii as Ascii +import qualified Data.Bytes.Text.AsciiExt as AsciiExt +import qualified Data.Bytes.Text.Latin1 as Latin1 import qualified Data.Bytes.Types as Types import qualified Data.Foldable as F import qualified Data.List as List @@ -470,61 +474,27 @@ countWhileEnd k (Bytes arr off0 len0) = go (off0 + len0 - 1) (len0 - 1) 0 where else n else n --- | Left fold over bytes, non-strict in the accumulator. -foldl :: (a -> Word8 -> a) -> a -> Bytes -> a -{-# inline foldl #-} -foldl f a0 (Bytes arr off0 len0) = - go (off0 + len0 - 1) (len0 - 1) - where - go !off !ix = case ix of - (-1) -> a0 - _ -> f (go (off - 1) (ix - 1)) (PM.indexByteArray arr off) - --- | Right fold over bytes, non-strict in the accumulator. -foldr :: (Word8 -> a -> a) -> a -> Bytes -> a -{-# inline foldr #-} -foldr f a0 (Bytes arr off0 len0) = go off0 len0 where - go !off !len = case len of - 0 -> a0 - _ -> f (PM.indexByteArray arr off) (go (off + 1) (len - 1)) - --- | Left fold over bytes, strict in the accumulator. The reduction function --- is applied to each element along with its index. -ifoldl' :: (a -> Int -> Word8 -> a) -> a -> Bytes -> a -{-# inline ifoldl' #-} -ifoldl' f a0 (Bytes arr off0 len0) = go a0 0 off0 len0 where - go !a !ix !off !len = case len of - 0 -> a - _ -> go (f a ix (PM.indexByteArray arr off)) (ix + 1) (off + 1) (len - 1) - --- | Right fold over bytes, strict in the accumulator. -foldr' :: (Word8 -> a -> a) -> a -> Bytes -> a -{-# inline foldr' #-} -foldr' f a0 (Bytes arr off0 len0) = - go a0 (off0 + len0 - 1) (len0 - 1) - where - go !a !off !ix = case ix of - (-1) -> a - _ -> go (f (PM.indexByteArray arr off) a) (off - 1) (ix - 1) - -- | Convert a 'String' consisting of only characters in the ASCII block -- to a byte sequence. Any character with a codepoint above @U+007F@ is -- replaced by @U+0000@. fromAsciiString :: String -> Bytes -fromAsciiString = fromByteArray - . Exts.fromList - . map (\c -> let i = ord c in if i < 128 then fromIntegral @Int @Word8 i else 0) +{-# DEPRECATED fromAsciiString "use Data.Bytes.Ascii.fromString instead" #-} +{-# INLINE fromAsciiString #-} +fromAsciiString = Ascii.fromString -- | Convert a 'String' consisting of only characters representable -- by ISO-8859-1. These are encoded with ISO-8859-1. Any character -- with a codepoint above @U+00FF@ is replaced by an unspecified byte. fromLatinString :: String -> Bytes -fromLatinString = - fromByteArray . Exts.fromList . map (fromIntegral @Int @Word8 . ord) +{-# DEPRECATED fromLatinString "use Data.Bytes.Latin1.fromString instead" #-} +{-# INLINE fromLatinString #-} +fromLatinString = Latin1.fromString -- | Interpret a byte sequence as text encoded by ISO-8859-1. toLatinString :: Bytes -> String -toLatinString = foldr (\(W8# w) xs -> C# (chr# (word2Int# w)) : xs) [] +{-# DEPRECATED toLatinString "use Data.Bytes.Latin1.toString instead" #-} +{-# INLINE toLatinString #-} +toLatinString = Latin1.toString -- | Copy a primitive string literal into managed memory. fromCString# :: Addr# -> Bytes @@ -546,153 +516,87 @@ compareByteArrays (ByteArray ba1#) (I# off1#) (ByteArray ba2#) (I# off2#) (I# n# -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a singleton whose element matches the character? equalsLatin1 :: Char -> Bytes -> Bool -{-# inline equalsLatin1 #-} -equalsLatin1 !c0 (Bytes arr off len) = case len of - 1 -> c0 == indexCharArray arr off - _ -> False +{-# DEPRECATED equalsLatin1 "use Data.Bytes.Text.Latin1.equals1 instead" #-} +{-# INLINE equalsLatin1 #-} +equalsLatin1 = Latin1.equals1 + -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a doubleton whose elements match the characters? equalsLatin2 :: Char -> Char -> Bytes -> Bool -equalsLatin2 !c0 !c1 (Bytes arr off len) = case len of - 2 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) - _ -> False +{-# DEPRECATED equalsLatin2 "use Data.Bytes.Text.Latin1.equals2 instead" #-} +{-# INLINE equalsLatin2 #-} +equalsLatin2 = Latin1.equals2 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a tripleton whose elements match the characters? equalsLatin3 :: Char -> Char -> Char -> Bytes -> Bool -equalsLatin3 !c0 !c1 !c2 (Bytes arr off len) = case len of - 3 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) - _ -> False +{-# DEPRECATED equalsLatin3 "use Data.Bytes.Text.Latin1.equals3 instead" #-} +{-# INLINE equalsLatin3 #-} +equalsLatin3 = Latin1.equals3 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a quadrupleton whose elements match the characters? equalsLatin4 :: Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin4 !c0 !c1 !c2 !c3 (Bytes arr off len) = case len of - 4 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) - _ -> False +{-# DEPRECATED equalsLatin4 "use Data.Bytes.Text.Latin1.equals4 instead" #-} +{-# INLINE equalsLatin4 #-} +equalsLatin4 = Latin1.equals4 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a quintupleton whose elements match the characters? equalsLatin5 :: Char -> Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin5 !c0 !c1 !c2 !c3 !c4 (Bytes arr off len) = case len of - 5 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) && - c4 == indexCharArray arr (off + 4) - _ -> False +{-# DEPRECATED equalsLatin5 "use Data.Bytes.Text.Latin1.equals5 instead" #-} +{-# INLINE equalsLatin5 #-} +equalsLatin5 = Latin1.equals5 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a sextupleton whose elements match the characters? equalsLatin6 :: Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin6 !c0 !c1 !c2 !c3 !c4 !c5 (Bytes arr off len) = case len of - 6 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) && - c4 == indexCharArray arr (off + 4) && - c5 == indexCharArray arr (off + 5) - _ -> False +{-# DEPRECATED equalsLatin6 "use Data.Bytes.Text.Latin1.equals6 instead" #-} +{-# INLINE equalsLatin6 #-} +equalsLatin6 = Latin1.equals6 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a septupleton whose elements match the characters? equalsLatin7 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin7 !c0 !c1 !c2 !c3 !c4 !c5 !c6 (Bytes arr off len) = case len of - 7 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) && - c4 == indexCharArray arr (off + 4) && - c5 == indexCharArray arr (off + 5) && - c6 == indexCharArray arr (off + 6) - _ -> False +{-# DEPRECATED equalsLatin7 "use Data.Bytes.Text.Latin1.equals7 instead" #-} +{-# INLINE equalsLatin7 #-} +equalsLatin7 = Latin1.equals7 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- an octupleton whose elements match the characters? equalsLatin8 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin8 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 (Bytes arr off len) = case len of - 8 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) && - c4 == indexCharArray arr (off + 4) && - c5 == indexCharArray arr (off + 5) && - c6 == indexCharArray arr (off + 6) && - c7 == indexCharArray arr (off + 7) - _ -> False +{-# DEPRECATED equalsLatin8 "use Data.Bytes.Text.Latin1.equals8 instead" #-} +{-# INLINE equalsLatin8 #-} +equalsLatin8 = Latin1.equals8 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a 9-tuple whose elements match the characters? equalsLatin9 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin9 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 (Bytes arr off len) = case len of - 9 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) && - c4 == indexCharArray arr (off + 4) && - c5 == indexCharArray arr (off + 5) && - c6 == indexCharArray arr (off + 6) && - c7 == indexCharArray arr (off + 7) && - c8 == indexCharArray arr (off + 8) - _ -> False +{-# DEPRECATED equalsLatin9 "use Data.Bytes.Text.Latin1.equals9 instead" #-} +{-# INLINE equalsLatin9 #-} +equalsLatin9 = Latin1.equals9 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a 10-tuple whose elements match the characters? equalsLatin10 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin10 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 (Bytes arr off len) = case len of - 10 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) && - c4 == indexCharArray arr (off + 4) && - c5 == indexCharArray arr (off + 5) && - c6 == indexCharArray arr (off + 6) && - c7 == indexCharArray arr (off + 7) && - c8 == indexCharArray arr (off + 8) && - c9 == indexCharArray arr (off + 9) - _ -> False +{-# DEPRECATED equalsLatin10 "use Data.Bytes.Text.Latin1.equals10 instead" #-} +{-# INLINE equalsLatin10 #-} +equalsLatin10 = Latin1.equals10 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a 11-tuple whose elements match the characters? equalsLatin11 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin11 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 (Bytes arr off len) = case len of - 11 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) && - c4 == indexCharArray arr (off + 4) && - c5 == indexCharArray arr (off + 5) && - c6 == indexCharArray arr (off + 6) && - c7 == indexCharArray arr (off + 7) && - c8 == indexCharArray arr (off + 8) && - c9 == indexCharArray arr (off + 9) && - c10 == indexCharArray arr (off + 10) - _ -> False +{-# DEPRECATED equalsLatin11 "use Data.Bytes.Text.Latin1.equals11 instead" #-} +{-# INLINE equalsLatin11 #-} +equalsLatin11 = Latin1.equals11 -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a 12-tuple whose elements match the characters? equalsLatin12 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool -equalsLatin12 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 (Bytes arr off len) = case len of - 12 -> c0 == indexCharArray arr off && - c1 == indexCharArray arr (off + 1) && - c2 == indexCharArray arr (off + 2) && - c3 == indexCharArray arr (off + 3) && - c4 == indexCharArray arr (off + 4) && - c5 == indexCharArray arr (off + 5) && - c6 == indexCharArray arr (off + 6) && - c7 == indexCharArray arr (off + 7) && - c8 == indexCharArray arr (off + 8) && - c9 == indexCharArray arr (off + 9) && - c10 == indexCharArray arr (off + 10) && - c11 == indexCharArray arr (off + 11) - _ -> False +{-# DEPRECATED equalsLatin12 "use Data.Bytes.Text.Latin1.equals12 instead" #-} +{-# INLINE equalsLatin12 #-} +equalsLatin12 = Latin1.equals12 -- | Is the byte sequence equal to the @NUL@-terminated C String? -- The C string must be a constant. @@ -719,15 +623,12 @@ stripCStringPrefix !ptr0 (Bytes arr off0 len0) = go (castPtr ptr0 :: Ptr Word8) False -> Nothing -- | Touch the byte array backing the byte sequence. This sometimes needed --- after calling 'contents' so that the @ByteArray@ does not get garbage +-- after calling 'Pure.contents' so that the @ByteArray@ does not get garbage -- collected. touch :: PrimMonad m => Bytes -> m () touch (Bytes (ByteArray arr) _ _) = unsafeIOToPrim (primitive_ (\s -> Exts.touch# arr s)) -indexCharArray :: ByteArray -> Int -> Char -indexCharArray (ByteArray arr) (I# off) = C# (Exts.indexCharArray# arr off) - -- | Read an entire file strictly into a 'Bytes'. readFile :: FilePath -> IO Bytes readFile f = Chunks.concat <$> Chunks.readFile f @@ -810,33 +711,6 @@ fromShortByteString (SBS x) = fromByteArray (ByteArray x) -- @[0x41,0x5A]@ and leaves all other bytes alone. Unconditionally -- copies the bytes. toLowerAsciiByteArrayClone :: Bytes -> ByteArray -toLowerAsciiByteArrayClone (Bytes src off0 len0) = - runByteArrayST action - where - action :: forall s. ST s ByteArray - action = do - dst <- PM.newByteArray len0 - let go !off !ix !len = if len == 0 - then pure () - else do - let w = PM.indexByteArray src off :: Word8 - w' = if w >= 0x41 && w <= 0x5A - then w + 32 - else w - PM.writeByteArray dst ix w' - go (off + 1) (ix + 1) (len - 1) - go off0 0 len0 - PM.unsafeFreezeByteArray dst - - --- | /O(n)/ Copy a 'ByteString' to a byte sequence. -fromByteString :: ByteString -> Bytes -fromByteString !b = Bytes - ( runByteArrayST $ unsafeIOToST $ do - dst@(PM.MutableByteArray dst# ) <- PM.newByteArray len - ByteString.unsafeUseAsCString b $ \src -> do - PM.copyPtrToMutablePrimArray (PM.MutablePrimArray dst# ) 0 src len - PM.unsafeFreezeByteArray dst - ) 0 len - where - !len = ByteString.length b +{-# DEPRECATED toLowerAsciiByteArrayClone "use Data.Bytes/Text/AsciiExt.toLowerU" #-} +{-# INLINE toLowerAsciiByteArrayClone #-} +toLowerAsciiByteArrayClone = AsciiExt.toLowerU diff --git a/src/Data/Bytes/Chunks.hs b/src/Data/Bytes/Chunks.hs index 46e1296..a171e44 100644 --- a/src/Data/Bytes/Chunks.hs +++ b/src/Data/Bytes/Chunks.hs @@ -290,7 +290,7 @@ fnv1a64 !b = foldl' ) 0xcbf29ce484222325 b -- | Outputs 'Chunks' to the specified 'Handle'. This is implemented --- with 'hPutBuf'. +-- with 'IO.hPut'. hPut :: Handle -> Chunks -> IO () hPut h = go where go ChunksNil = pure () diff --git a/src/Data/Bytes/IO.hs b/src/Data/Bytes/IO.hs index 27df734..37457eb 100644 --- a/src/Data/Bytes/IO.hs +++ b/src/Data/Bytes/IO.hs @@ -22,12 +22,12 @@ import qualified GHC.Exts as Exts import qualified Data.Primitive as PM -- | Read 'Bytes' directly from the specified 'Handle'. The resulting --- 'Bytes' are pinned. This is implemented with 'hGetBuf'. +-- 'Bytes' are pinned. This is implemented with 'IO.hGetBuf'. hGet :: Handle -> Int -> IO Bytes hGet h i = createPinnedAndTrim i (\p -> IO.hGetBuf h p i) -- | Outputs 'Bytes' to the specified 'Handle'. This is implemented --- with 'hPutBuf'. +-- with 'IO.hPutBuf'. hPut :: Handle -> Bytes -> IO () hPut h b0 = do let b1@(Bytes arr _ len) = pin b0 @@ -52,4 +52,3 @@ touchMutableByteArrayIO (PM.MutableByteArray x) = touchByteArrayIO :: ByteArray -> IO () touchByteArrayIO (ByteArray x) = IO (\s -> (# Exts.touch# x s, () #)) - diff --git a/src/Data/Bytes/Mutable.hs b/src/Data/Bytes/Mutable.hs index d2b190f..abce95c 100644 --- a/src/Data/Bytes/Mutable.hs +++ b/src/Data/Bytes/Mutable.hs @@ -1,6 +1,13 @@ {-# language BangPatterns #-} {-# language LambdaCase #-} +-- | If you are interested in sub-arrays of 'MutableByteArray's (e.g. writing +-- quicksort), it would be grossly inefficient to make a copy of the sub-array. +-- On the other hand, it'd be really annoying to track limit indices by hand. +-- +-- This module defines the 'MutableBytes' type which exposes a standard array +-- interface for a sub-arrays without copying and without manual index +-- manipulation. For immutable arrays, see 'Data.Bytes'. module Data.Bytes.Mutable ( -- * Types MutableBytes diff --git a/src/Data/Bytes/Pure.hs b/src/Data/Bytes/Pure.hs index 969a21b..a277d24 100644 --- a/src/Data/Bytes/Pure.hs +++ b/src/Data/Bytes/Pure.hs @@ -18,26 +18,35 @@ module Data.Bytes.Pure , toPinnedByteArrayClone , fromByteArray , length + , foldl , foldl' + , foldr + , ifoldl' + , foldr' , fnv1a32 , fnv1a64 , toByteString , pinnedToByteString + , fromByteString ) where -import Prelude hiding (length) +import Prelude hiding (length,foldl,foldr) import Control.Monad.Primitive (PrimState,PrimMonad) import Control.Monad.ST.Run (runByteArrayST) import Data.Bits (xor) -import Data.ByteString (ByteString) import Data.Bytes.Types (Bytes(Bytes)) +import Data.ByteString (ByteString) import Data.Primitive (ByteArray,MutableByteArray) import Data.Word (Word64,Word32,Word8) import Foreign.Ptr (Ptr,plusPtr) +import GHC.IO (unsafeIOToST) +import qualified Data.ByteString as ByteString import qualified Data.ByteString.Internal as ByteString +import qualified Data.ByteString.Unsafe as ByteString import qualified Data.Primitive as PM +import qualified Data.Primitive.Ptr as PM import qualified GHC.Exts as Exts import qualified GHC.ForeignPtr as ForeignPtr @@ -124,6 +133,16 @@ fnv1a64 !b = foldl' (\acc w -> (fromIntegral @Word8 @Word64 w `xor` acc) * 0x00000100000001B3 ) 0xcbf29ce484222325 b +-- | Left fold over bytes, non-strict in the accumulator. +foldl :: (a -> Word8 -> a) -> a -> Bytes -> a +{-# inline foldl #-} +foldl f a0 (Bytes arr off0 len0) = + go (off0 + len0 - 1) (len0 - 1) + where + go !off !ix = case ix of + (-1) -> a0 + _ -> f (go (off - 1) (ix - 1)) (PM.indexByteArray arr off) + -- | Left fold over bytes, strict in the accumulator. foldl' :: (a -> Word8 -> a) -> a -> Bytes -> a {-# inline foldl' #-} @@ -132,6 +151,34 @@ foldl' f a0 (Bytes arr off0 len0) = go a0 off0 len0 where 0 -> a _ -> go (f a (PM.indexByteArray arr off)) (off + 1) (len - 1) +-- | Right fold over bytes, non-strict in the accumulator. +foldr :: (Word8 -> a -> a) -> a -> Bytes -> a +{-# inline foldr #-} +foldr f a0 (Bytes arr off0 len0) = go off0 len0 where + go !off !len = case len of + 0 -> a0 + _ -> f (PM.indexByteArray arr off) (go (off + 1) (len - 1)) + +-- | Left fold over bytes, strict in the accumulator. The reduction function +-- is applied to each element along with its index. +ifoldl' :: (a -> Int -> Word8 -> a) -> a -> Bytes -> a +{-# inline ifoldl' #-} +ifoldl' f a0 (Bytes arr off0 len0) = go a0 0 off0 len0 where + go !a !ix !off !len = case len of + 0 -> a + _ -> go (f a ix (PM.indexByteArray arr off)) (ix + 1) (off + 1) (len - 1) + +-- | Right fold over bytes, strict in the accumulator. +foldr' :: (Word8 -> a -> a) -> a -> Bytes -> a +{-# inline foldr' #-} +foldr' f a0 (Bytes arr off0 len0) = + go a0 (off0 + len0 - 1) (len0 - 1) + where + go !a !off !ix = case ix of + (-1) -> a + _ -> go (f (PM.indexByteArray arr off) a) (off - 1) (ix - 1) + + -- | Yields a pointer to the beginning of the byte sequence. It is only safe -- to call this on a 'Bytes' backed by a pinned @ByteArray@. contents :: Bytes -> Ptr Word8 @@ -162,7 +209,8 @@ toPinnedByteArrayClone (Bytes arr off len) = runByteArrayST $ do toByteString :: Bytes -> ByteString toByteString !b = pinnedToByteString (pin b) --- | /O(1)/ Precondition: bytes are pinned. Behavior is undefined otherwise. +-- | Convert a pinned 'Bytes' to a 'ByteString' +-- /O(1)/ Precondition: bytes are pinned. Behavior is undefined otherwise. pinnedToByteString :: Bytes -> ByteString pinnedToByteString (Bytes y@(PM.ByteArray x) off len) = ByteString.PS @@ -171,3 +219,15 @@ pinnedToByteString (Bytes y@(PM.ByteArray x) off len) = (ForeignPtr.PlainPtr (Exts.unsafeCoerce# x)) ) 0 len + +-- | /O(n)/ Copy a 'ByteString' to a byte sequence. +fromByteString :: ByteString -> Bytes +fromByteString !b = Bytes + ( runByteArrayST $ unsafeIOToST $ do + dst@(PM.MutableByteArray dst# ) <- PM.newByteArray len + ByteString.unsafeUseAsCString b $ \src -> do + PM.copyPtrToMutablePrimArray (PM.MutablePrimArray dst# ) 0 src len + PM.unsafeFreezeByteArray dst + ) 0 len + where + !len = ByteString.length b diff --git a/src/Data/Bytes/Text/Ascii.hs b/src/Data/Bytes/Text/Ascii.hs new file mode 100644 index 0000000..c50d96e --- /dev/null +++ b/src/Data/Bytes/Text/Ascii.hs @@ -0,0 +1,29 @@ +{-# LANGUAGE TypeApplications #-} + +-- | This module treats 'Bytes' data as holding ASCII text. Providing bytes +-- outside the ASCII range (@U+0000@ -- @U+007F@) may cause a failure or +-- unspecified results, but such bytes will never be inspected. +-- +-- For functions that can operate on ASCII-compatible encodings, see +-- 'Data.Bytes.Text.AsciiExt'. +module Data.Bytes.Text.Ascii + ( fromString + ) where + +import Data.Bytes.Types (Bytes) +import Data.Char (ord) +import Data.Word (Word8) + +import qualified Data.Bytes.Pure as Bytes +import qualified GHC.Exts as Exts + + +-- | Convert a 'String' consisting of only characters in the ASCII block +-- to a byte sequence. Any character with a codepoint above @U+007F@ is +-- replaced by @U+0000@. +fromString :: String -> Bytes +fromString = Bytes.fromByteArray + . Exts.fromList + . map (\c -> let i = ord c in if i < 128 then fromIntegral @Int @Word8 i else 0) + +-- TODO presumably also fromText and fromShortText diff --git a/src/Data/Bytes/Text/AsciiExt.hs b/src/Data/Bytes/Text/AsciiExt.hs new file mode 100644 index 0000000..5916bb5 --- /dev/null +++ b/src/Data/Bytes/Text/AsciiExt.hs @@ -0,0 +1,96 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE RankNTypes #-} + +-- | This module contains functions which operate on supersets of 'Bytes' containing ASCII-encoded text. +-- That is, none of the functions here inspect bytes with a value greater than 127, and do not fail due to the presence of such bytes. + +-- For functions that can fail for bytes outside the ASCII range, see +-- 'Data.Bytes.Ascii'. For functions that can inspect bytes outside ASCII, see +-- any of the modules for ASCII-compatible encodings (e.g. 'Data.Bytes.Utf8', +-- 'Data.Bytes.Latin1', and so on). +module Data.Bytes.Text.AsciiExt + ( -- * Line-Oriented IO + hFoldLines + , hForLines_ + -- ** Standard Handles + , forLines_ + , foldLines + -- * Text Manipulation + , toLowerU + ) where + +import Control.Monad.ST (ST) +import Control.Monad.ST.Run (runByteArrayST) +import Data.Bytes.Types (Bytes(..)) +import Data.Primitive (ByteArray) +import Data.Word (Word8) +import System.IO (Handle, hIsEOF, stdin) + +import qualified Data.Bytes.Pure as Bytes +import qualified Data.ByteString.Char8 as BC8 +import qualified Data.Primitive as PM + +-- | `hForLines_` over `stdin` +forLines_ :: (Bytes -> IO a) -> IO () +{-# INLINEABLE forLines_ #-} +forLines_ = hForLines_ stdin + +-- | `hFoldLines` over `stdin` +foldLines :: a -> (a -> Bytes -> IO a) -> IO a +{-# INLINEABLE foldLines #-} +foldLines = hFoldLines stdin + +-- | Perform an action on each line of the input, discarding results. +-- To maintain a running state, see 'hFoldLines'. +-- +-- Lines are extracted with with 'BC8.hGetLine', which does not document its +-- dectection algorithm. As of writing (bytestring v0.11.1.0), lines are +-- delimited by a single @\n@ character (UNIX-style, as all things should be). +hForLines_ :: Handle -> (Bytes -> IO a) -> IO () +hForLines_ h body = loop + where + loop = hIsEOF h >>= \case + False -> do + line <- Bytes.fromByteString <$> BC8.hGetLine h + _ <- body line + loop + True -> pure () + +-- | Perform an action on each line of the input, threading state through the computation. +-- If you do not need to keep a state, see `hForLines_`. +-- +-- Lines are extracted with with 'BC8.hGetLine', which does not document its +-- dectection algorithm. As of writing (bytestring v0.11.1.0), lines are +-- delimited by a single @\n@ character (UNIX-style, as all things should be). +hFoldLines :: Handle -> a -> (a -> Bytes -> IO a) -> IO a +hFoldLines h z body = loop z + where + loop !x = hIsEOF h >>= \case + False -> do + line <- Bytes.fromByteString <$> BC8.hGetLine h + x' <- body x line + loop x' + True -> pure x + +-- | /O(n)/ Convert ASCII letters to lowercase. This adds @0x20@ to bytes in the +-- range @[0x41,0x5A]@ (@A-Z@ ⇒ @a-z@) and leaves all other bytes alone. +-- Unconditionally copies the bytes. +toLowerU :: Bytes -> ByteArray +toLowerU (Bytes src off0 len0) = + runByteArrayST action + where + action :: forall s. ST s ByteArray + action = do + dst <- PM.newByteArray len0 + let go !off !ix !len = if len == 0 + then pure () + else do + let w = PM.indexByteArray src off :: Word8 + w' = if w >= 0x41 && w <= 0x5A + then w + 32 + else w + PM.writeByteArray dst ix w' + go (off + 1) (ix + 1) (len - 1) + go off0 0 len0 + PM.unsafeFreezeByteArray dst diff --git a/src/Data/Bytes/Text/Latin1.hs b/src/Data/Bytes/Text/Latin1.hs new file mode 100644 index 0000000..ba7ce1e --- /dev/null +++ b/src/Data/Bytes/Text/Latin1.hs @@ -0,0 +1,214 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE TypeApplications #-} + +-- | This module treats 'Bytes' data as holding text encoded in ISO-8859-1. This +-- encoding can only encode codepoints strictly below @U+0100@, but this allows +-- each codepoint to be placed directly into a single byte. This range consists +-- of Unicode Basic Latin, Latin-1 Supplement and C0+C1 Controls, which includes +-- ASCII. +-- +-- Strictly, ISO-8859-1 is not to be confused with ISO/IEC 8859-1 (which was the +-- default encoding for webpages before HTML5). ISO/IEC 8859-1 lacks encodings +-- for the C0 and C1 control characters. +-- +-- With HTML5, the default encoding of webpages was changed to Windows-1252, +-- which is _not_ compatible with ISO-8859-1. Windows-1252 uses the C1 Control +-- range (@U+0080@ -- @U+009F@) mostly to encode a variety of printable +-- characters. For this encoding, see 'Data.Bytes.Text.Windows1252'. +module Data.Bytes.Text.Latin1 + ( toString + , fromString + -- * Specialized Comparisons + , equals1 + , equals2 + , equals3 + , equals4 + , equals5 + , equals6 + , equals7 + , equals8 + , equals9 + , equals10 + , equals11 + , equals12 + ) where + +import Data.Bytes.Types (Bytes(..)) +import Data.Char (ord) +import Data.Primitive (ByteArray(ByteArray)) +import GHC.Exts (Int(I#),Char(C#),word2Int#,chr#) +import GHC.Word (Word8(W8#)) + +import qualified Data.Bytes.Pure as Bytes +import qualified GHC.Exts as Exts + + +-- | Convert a 'String' consisting of only characters representable +-- by ISO-8859-1. These are encoded with ISO-8859-1. Any character +-- with a codepoint above @U+00FF@ is replaced by an unspecified byte. +fromString :: String -> Bytes +fromString = + Bytes.fromByteArray . Exts.fromList . map (fromIntegral @Int @Word8 . ord) + +-- | Interpret a byte sequence as text encoded by ISO-8859-1. +toString :: Bytes -> String +{-# INLINE toString #-} +toString = Bytes.foldr (\(W8# w) xs -> C# (chr# (word2Int# w)) : xs) [] + +-- TODO presumably also fromText and fromShortText + + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a singleton whose element matches the character? +equals1 :: Char -> Bytes -> Bool +{-# INLINE equals1 #-} +equals1 !c0 (Bytes arr off len) = case len of + 1 -> c0 == indexCharArray arr off + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a doubleton whose elements match the characters? +equals2 :: Char -> Char -> Bytes -> Bool +equals2 !c0 !c1 (Bytes arr off len) = case len of + 2 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a tripleton whose elements match the characters? +equals3 :: Char -> Char -> Char -> Bytes -> Bool +equals3 !c0 !c1 !c2 (Bytes arr off len) = case len of + 3 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a quadrupleton whose elements match the characters? +equals4 :: Char -> Char -> Char -> Char -> Bytes -> Bool +equals4 !c0 !c1 !c2 !c3 (Bytes arr off len) = case len of + 4 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a quintupleton whose elements match the characters? +equals5 :: Char -> Char -> Char -> Char -> Char -> Bytes -> Bool +equals5 !c0 !c1 !c2 !c3 !c4 (Bytes arr off len) = case len of + 5 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) && + c4 == indexCharArray arr (off + 4) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a sextupleton whose elements match the characters? +equals6 :: Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool +equals6 !c0 !c1 !c2 !c3 !c4 !c5 (Bytes arr off len) = case len of + 6 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) && + c4 == indexCharArray arr (off + 4) && + c5 == indexCharArray arr (off + 5) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a septupleton whose elements match the characters? +equals7 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool +equals7 !c0 !c1 !c2 !c3 !c4 !c5 !c6 (Bytes arr off len) = case len of + 7 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) && + c4 == indexCharArray arr (off + 4) && + c5 == indexCharArray arr (off + 5) && + c6 == indexCharArray arr (off + 6) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- an octupleton whose elements match the characters? +equals8 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool +equals8 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 (Bytes arr off len) = case len of + 8 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) && + c4 == indexCharArray arr (off + 4) && + c5 == indexCharArray arr (off + 5) && + c6 == indexCharArray arr (off + 6) && + c7 == indexCharArray arr (off + 7) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a 9-tuple whose elements match the characters? +equals9 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool +equals9 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 (Bytes arr off len) = case len of + 9 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) && + c4 == indexCharArray arr (off + 4) && + c5 == indexCharArray arr (off + 5) && + c6 == indexCharArray arr (off + 6) && + c7 == indexCharArray arr (off + 7) && + c8 == indexCharArray arr (off + 8) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a 10-tuple whose elements match the characters? +equals10 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool +equals10 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 (Bytes arr off len) = case len of + 10 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) && + c4 == indexCharArray arr (off + 4) && + c5 == indexCharArray arr (off + 5) && + c6 == indexCharArray arr (off + 6) && + c7 == indexCharArray arr (off + 7) && + c8 == indexCharArray arr (off + 8) && + c9 == indexCharArray arr (off + 9) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a 11-tuple whose elements match the characters? +equals11 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool +equals11 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 (Bytes arr off len) = case len of + 11 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) && + c4 == indexCharArray arr (off + 4) && + c5 == indexCharArray arr (off + 5) && + c6 == indexCharArray arr (off + 6) && + c7 == indexCharArray arr (off + 7) && + c8 == indexCharArray arr (off + 8) && + c9 == indexCharArray arr (off + 9) && + c10 == indexCharArray arr (off + 10) + _ -> False + +-- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, +-- a 12-tuple whose elements match the characters? +equals12 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool +equals12 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 (Bytes arr off len) = case len of + 12 -> c0 == indexCharArray arr off && + c1 == indexCharArray arr (off + 1) && + c2 == indexCharArray arr (off + 2) && + c3 == indexCharArray arr (off + 3) && + c4 == indexCharArray arr (off + 4) && + c5 == indexCharArray arr (off + 5) && + c6 == indexCharArray arr (off + 6) && + c7 == indexCharArray arr (off + 7) && + c8 == indexCharArray arr (off + 8) && + c9 == indexCharArray arr (off + 9) && + c10 == indexCharArray arr (off + 10) && + c11 == indexCharArray arr (off + 11) + _ -> False + +indexCharArray :: ByteArray -> Int -> Char +indexCharArray (ByteArray arr) (I# off) = C# (Exts.indexCharArray# arr off) diff --git a/src/Data/Bytes/Text/Utf8.hs b/src/Data/Bytes/Text/Utf8.hs new file mode 100644 index 0000000..495c438 --- /dev/null +++ b/src/Data/Bytes/Text/Utf8.hs @@ -0,0 +1,5 @@ +-- | Placeholder module in case there is demand for treating 'Bytes' as +-- UTF8-encoded text +module Data.Bytes.Text.Utf8 () where + +import Data.Bytes.Types (Bytes) diff --git a/src/Data/Bytes/Text/Windows1252.hs b/src/Data/Bytes/Text/Windows1252.hs new file mode 100644 index 0000000..f4d3053 --- /dev/null +++ b/src/Data/Bytes/Text/Windows1252.hs @@ -0,0 +1,5 @@ +-- | Placeholder module in case there is demand for treating 'Bytes' as +-- Windows-1252-encoded text +module Data.Bytes.Text.Windows1252 () where + +import Data.Bytes.Types (Bytes) \ No newline at end of file