Skip to content

Commit

Permalink
Add functions to AsciiExt for splitting and breaking
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewthad committed Feb 26, 2024
1 parent ad4523c commit eddd468
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 102 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Revision history for byteslice

## 0.2.14.0 -- 2024-02-26

* Add functions to `Data.Bytes.Text.AsciiExt`: `split(1|2|3|4)`,
`splitTetragram1`, `anyEq`, `takeWhileNotEq`, `dropWhileNotEq`,
`takeWhileEndNotEq`, `dropWhileEndEq`.

## 0.2.13.2 -- 2024-02-06

* Restore `Data.Bytes.Text.Utf8.toText`.
Expand Down
2 changes: 1 addition & 1 deletion byteslice.cabal
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cabal-version: 2.4
name: byteslice
version: 0.2.13.2
version: 0.2.14.0
synopsis: Slicing managed and unmanaged memory
description:
This library provides types that allow the user to talk about a slice of
Expand Down
110 changes: 10 additions & 100 deletions src/Data/Bytes.hs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ module Data.Bytes
, unsnoc

-- * Predicates
, any
, all
, Pure.any
, Pure.all

-- * Create

Expand Down Expand Up @@ -89,7 +89,7 @@ module Data.Bytes

-- ** Fixed from Beginning
, Byte.split1
, splitTetragram1
, Pure.splitTetragram1
, Byte.split2
, Byte.split3
, Byte.split4
Expand All @@ -106,7 +106,7 @@ module Data.Bytes
-- * Searching
, replace
, findIndices
, findTetragramIndex
, Pure.findTetragramIndex

-- * Counting
, Byte.count
Expand Down Expand Up @@ -207,17 +207,16 @@ import Prelude hiding (all, any, dropWhile, elem, foldl, foldr, length, map, nul
import Control.Monad.Primitive (PrimMonad, primitive_, unsafeIOToPrim)
import Control.Monad.ST.Run (runByteArrayST)
import Cstrlen (cstringLength#)
import Data.Bits (unsafeShiftL, (.|.))
import Data.ByteString.Short.Internal (ShortByteString (SBS))
import Data.Bytes.Pure (foldr, fromByteArray, length, toShortByteString, unsafeDrop, unsafeIndex)
import Data.Bytes.Pure (fromByteArray, length, toShortByteString, unsafeDrop, unsafeIndex)
import Data.Bytes.Search (findIndices, isInfixOf, replace)
import Data.Bytes.Types (ByteArrayN (ByteArrayN), Bytes (Bytes, array, offset), BytesN (BytesN))
import Data.Primitive (Array, ByteArray (ByteArray))
import Data.Text.Short (ShortText)
import Foreign.C.String (CString)
import Foreign.Ptr (Ptr, castPtr, plusPtr)
import GHC.Exts (Addr#, Int (I#), Int#, Ptr (Ptr), Word#)
import GHC.Word (Word32, Word8 (W8#))
import GHC.Word (Word8 (W8#))
import Reps (Bytes# (..), word8ToWord#)

import qualified Arithmetic.Nat as Nat
Expand Down Expand Up @@ -402,60 +401,30 @@ elemLoop !r !w (Bytes arr@(ByteArray arr#) off@(I# off#) len) = case len of
-- | Take bytes from the front of the sequence while the predicate is
-- true. The number of bytes kept is the count reported by @countWhile@.
takeWhile :: (Word8 -> Bool) -> Bytes -> Bytes
{-# INLINE takeWhile #-}
takeWhile k b = Pure.unsafeTake (countWhile k b) b
takeWhile k b = Pure.unsafeTake (Pure.countWhile k b) b

-- | Drop bytes from the front of the sequence while the predicate is
-- true. The number of bytes dropped is the count reported by @countWhile@.
dropWhile :: (Word8 -> Bool) -> Bytes -> Bytes
{-# INLINE dropWhile #-}
dropWhile k b = Pure.unsafeDrop (countWhile k b) b
dropWhile k b = Pure.unsafeDrop (Pure.countWhile k b) b

{- | /O(n)/ 'dropWhileEnd' @p@ @b@ returns the prefix remaining after
dropping bytes that satisfy the predicate @p@ from the end of
@b@.
-}
dropWhileEnd :: (Word8 -> Bool) -> Bytes -> Bytes
{-# INLINE dropWhileEnd #-}
dropWhileEnd k !b = Pure.unsafeTake (length b - countWhileEnd k b) b
dropWhileEnd k !b = Pure.unsafeTake (length b - Pure.countWhileEnd k b) b

{- | /O(n)/ 'takeWhileEnd' @p@ @b@ returns the longest suffix of
bytes in @b@ that satisfy predicate @p@.
-}
takeWhileEnd :: (Word8 -> Bool) -> Bytes -> Bytes
{-# INLINE takeWhileEnd #-}
takeWhileEnd k !b =
let n = countWhileEnd k b
let n = Pure.countWhileEnd k b
-- The suffix begins n bytes before the end of the slice and has length n.
in Bytes (array b) (offset b + length b - n) n

-- Internal. Counts how many bytes at the front of the slice satisfy
-- the predicate, stopping at the first byte that does not. When every
-- byte satisfies the predicate, the result equals the slice length.
countWhile :: (Word8 -> Bool) -> Bytes -> Int
{-# INLINE countWhile #-}
countWhile p (Bytes arr start total) = loop start total 0
 where
  loop !ix !remaining !matched
    | remaining <= 0 = matched
    | p (PM.indexByteArray arr ix) = loop (ix + 1) (remaining - 1) (matched + 1)
    | otherwise = matched

-- Internal. Like countWhile, but scans backwards from the last byte
-- of the slice, counting the bytes at the end that satisfy the
-- predicate.
countWhileEnd :: (Word8 -> Bool) -> Bytes -> Int
{-# INLINE countWhileEnd #-}
countWhileEnd p (Bytes arr start total) = loop (start + total - 1) (total - 1) 0
 where
  -- @left@ is the number of unexamined bytes minus one; it goes
  -- negative once the whole slice has been scanned.
  loop !ix !left !matched
    | left < 0 = matched
    | p (PM.indexByteArray arr ix) = loop (ix - 1) (left - 1) (matched + 1)
    | otherwise = matched

{- | Convert a 'String' consisting of only characters in the ASCII block
to a byte sequence. Any character with a codepoint above @U+007F@ is
replaced by @U+0000@.
Expand Down Expand Up @@ -694,16 +663,6 @@ intercalateByte2 !sep !a !b =
where
len = length a + length b + 1

-- | /O(n)/ Returns 'True' if at least one byte in the sequence
-- satisfies the predicate.
any :: (Word8 -> Bool) -> Bytes -> Bool
{-# INLINE any #-}
any p = foldr step False
 where
  step w rest = p w || rest

-- | /O(n)/ Returns 'True' if every byte in the sequence satisfies
-- the predicate.
all :: (Word8 -> Bool) -> Bytes -> Bool
{-# INLINE all #-}
all p = foldr step True
 where
  step w rest = p w && rest

{- | Variant of 'toShortByteString' that unconditionally makes a copy of
the array backing the sliced 'Bytes' even if the original array
could be reused. Prefer 'toShortByteString'.
Expand Down Expand Up @@ -786,52 +745,3 @@ withLengthU !arr f =
Nat.with
(PM.sizeofByteArray arr)
(\n -> f n (ByteArrayN arr))

{- | Search for the first occurrence of the four-byte sequence
@w0@, @w1@, @w2@, @w3@. On success, the result is the position of @w0@
relative to the start of the slice (not the backing array). Returns
'Nothing' when the slice is shorter than four bytes or the sequence
does not occur.
-}
findTetragramIndex ::
  Word8 ->
  Word8 ->
  Word8 ->
  Word8 ->
  Bytes ->
  Maybe Int
findTetragramIndex !w0 !w1 !w2 !w3 (Bytes arr off len) =
  if len < 4
    then Nothing
    else
      let !target =
            unsafeShiftL (fromIntegral w0 :: Word32) 24
              .|. unsafeShiftL (fromIntegral w1 :: Word32) 16
              .|. unsafeShiftL (fromIntegral w2 :: Word32) 8
              .|. unsafeShiftL (fromIntegral w3 :: Word32) 0
          !end = off + len
          -- Invariant: acc holds the four bytes at array indices
          -- [ix - 4, ix), most significant byte first.
          go !ix !acc =
            if acc == target
              then Just (ix - off - 4)
              else
                if ix < end
                  then
                    let !w = PM.indexByteArray arr ix :: Word8
                        acc' =
                          (fromIntegral w :: Word32)
                            .|. unsafeShiftL acc 8
                     in go (ix + 1) acc'
                  else Nothing
          -- The window must be seeded from the slice offset rather than
          -- from index 0 of the backing array; otherwise slices with a
          -- non-zero offset read bytes outside the slice and report
          -- wrong (possibly negative) positions.
          !acc0 =
            unsafeShiftL (fromIntegral (PM.indexByteArray arr off :: Word8) :: Word32) 24
              .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr (off + 1) :: Word8) :: Word32) 16
              .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr (off + 2) :: Word8) :: Word32) 8
              .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr (off + 3) :: Word8) :: Word32) 0
       in go (off + 4) acc0

{- | Split the byte sequence around the first occurrence of the
four-byte sequence @w0@, @w1@, @w2@, @w3@, as located by
'findTetragramIndex'. The four matched bytes are in neither half.
Returns 'Nothing' when 'findTetragramIndex' finds no match.
-}
splitTetragram1 ::
  Word8 ->
  Word8 ->
  Word8 ->
  Word8 ->
  Bytes ->
  Maybe (Bytes, Bytes)
splitTetragram1 !w0 !w1 !w2 !w3 !b =
  fmap cut (findTetragramIndex w0 w1 w2 w3 b)
 where
  cut ix = (Pure.unsafeTake ix b, Pure.unsafeDrop (ix + 4) b)
99 changes: 98 additions & 1 deletion src/Data/Bytes/Pure.hs
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,19 @@ module Data.Bytes.Pure
, toShortByteString
, replicate
, replicateU
, splitTetragram1
, findTetragramIndex
, countWhile
, countWhileEnd
, any
, all
) where

import Prelude hiding (Foldable (..), map, replicate)
import Prelude hiding (Foldable (..), map, replicate, any, all)

import Control.Monad.Primitive (PrimMonad, PrimState)
import Control.Monad.ST.Run (runByteArrayST)
import Data.Bits (unsafeShiftL, (.|.))
import Data.Bits (xor)
import Data.ByteString (ByteString)
import Data.ByteString.Short.Internal (ShortByteString (SBS))
Expand Down Expand Up @@ -402,3 +409,93 @@ replicateU !n !w = runByteArrayST do
arr <- PM.newByteArray n
PM.setByteArray arr 0 n w
PM.unsafeFreezeByteArray arr

{- | Split the byte sequence around the first occurrence of the
four-byte sequence @w0@, @w1@, @w2@, @w3@, as located by
'findTetragramIndex'. The four matched bytes are in neither half.
Returns 'Nothing' when 'findTetragramIndex' finds no match.
-}
splitTetragram1 ::
  Word8 ->
  Word8 ->
  Word8 ->
  Word8 ->
  Bytes ->
  Maybe (Bytes, Bytes)
splitTetragram1 !w0 !w1 !w2 !w3 !b =
  fmap cut (findTetragramIndex w0 w1 w2 w3 b)
 where
  cut ix = (unsafeTake ix b, unsafeDrop (ix + 4) b)

{- | Search for the first occurrence of the four-byte sequence
@w0@, @w1@, @w2@, @w3@. On success, the result is the position of @w0@
relative to the start of the slice (not the backing array). Returns
'Nothing' when the slice is shorter than four bytes or the sequence
does not occur.
-}
findTetragramIndex ::
  Word8 ->
  Word8 ->
  Word8 ->
  Word8 ->
  Bytes ->
  Maybe Int
findTetragramIndex !w0 !w1 !w2 !w3 (Bytes arr off len) =
  if len < 4
    then Nothing
    else
      let !target =
            unsafeShiftL (fromIntegral w0 :: Word32) 24
              .|. unsafeShiftL (fromIntegral w1 :: Word32) 16
              .|. unsafeShiftL (fromIntegral w2 :: Word32) 8
              .|. unsafeShiftL (fromIntegral w3 :: Word32) 0
          !end = off + len
          -- Invariant: acc holds the four bytes at array indices
          -- [ix - 4, ix), most significant byte first.
          go !ix !acc =
            if acc == target
              then Just (ix - off - 4)
              else
                if ix < end
                  then
                    let !w = PM.indexByteArray arr ix :: Word8
                        acc' =
                          (fromIntegral w :: Word32)
                            .|. unsafeShiftL acc 8
                     in go (ix + 1) acc'
                  else Nothing
          -- The window must be seeded from the slice offset rather than
          -- from index 0 of the backing array; otherwise slices with a
          -- non-zero offset read bytes outside the slice and report
          -- wrong (possibly negative) positions.
          !acc0 =
            unsafeShiftL (fromIntegral (PM.indexByteArray arr off :: Word8) :: Word32) 24
              .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr (off + 1) :: Word8) :: Word32) 16
              .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr (off + 2) :: Word8) :: Word32) 8
              .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr (off + 3) :: Word8) :: Word32) 0
       in go (off + 4) acc0

-- Internal. Counts how many bytes at the front of the slice satisfy
-- the predicate, stopping at the first byte that does not. When every
-- byte satisfies the predicate, the result equals the slice length.
countWhile :: (Word8 -> Bool) -> Bytes -> Int
{-# INLINE countWhile #-}
countWhile p (Bytes arr start total) = loop start total 0
 where
  loop !ix !remaining !matched
    | remaining <= 0 = matched
    | p (PM.indexByteArray arr ix) = loop (ix + 1) (remaining - 1) (matched + 1)
    | otherwise = matched

-- Internal. Like countWhile, but scans backwards from the last byte
-- of the slice, counting the bytes at the end that satisfy the
-- predicate.
countWhileEnd :: (Word8 -> Bool) -> Bytes -> Int
{-# INLINE countWhileEnd #-}
countWhileEnd p (Bytes arr start total) = loop (start + total - 1) (total - 1) 0
 where
  -- @left@ is the number of unexamined bytes minus one; it goes
  -- negative once the whole slice has been scanned.
  loop !ix !left !matched
    | left < 0 = matched
    | p (PM.indexByteArray arr ix) = loop (ix - 1) (left - 1) (matched + 1)
    | otherwise = matched

-- | /O(n)/ Returns 'True' if at least one byte in the sequence
-- satisfies the predicate.
any :: (Word8 -> Bool) -> Bytes -> Bool
{-# INLINE any #-}
any p = foldr step False
 where
  step w rest = p w || rest

-- | /O(n)/ Returns 'True' if every byte in the sequence satisfies
-- the predicate.
all :: (Word8 -> Bool) -> Bytes -> Bool
{-# INLINE all #-}
all p = foldr step True
 where
  step w rest = p w && rest

Loading

0 comments on commit eddd468

Please sign in to comment.