From d7394473aba5e191802879b8692819c30483667f Mon Sep 17 00:00:00 2001 From: Eiko Date: Sun, 8 Mar 2026 18:52:09 +0000 Subject: [PATCH 1/2] Improvements on readByteString(') by avoid extra allocation also: fuse (chr . fromIntegral) in readString (less alloc) --- src/DataFrame/IO/Parquet/Binary.hs | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/DataFrame/IO/Parquet/Binary.hs b/src/DataFrame/IO/Parquet/Binary.hs index 5697d44a..543c5522 100644 --- a/src/DataFrame/IO/Parquet/Binary.hs +++ b/src/DataFrame/IO/Parquet/Binary.hs @@ -9,6 +9,10 @@ import Data.Char import Data.IORef import Data.Int import Data.Word +import qualified Foreign.Ptr as Foreign +import qualified Foreign.Storable as Foreign +import qualified Data.ByteString.Unsafe as BSU +import qualified Foreign.Marshal.Alloc as Foreign littleEndianWord32 :: BS.ByteString -> Word32 littleEndianWord32 bytes @@ -124,7 +128,7 @@ readInt32FromBuffer buf bufferPos = do readString :: BS.ByteString -> IORef Int -> IO String readString buf pos = do nameSize <- readVarIntFromBuffer @Int buf pos - map (chr . fromIntegral) <$> replicateM nameSize (readAndAdvance pos buf) + replicateM nameSize (chr . fromIntegral <$> readAndAdvance pos buf) readByteStringFromBytes :: BS.ByteString -> (BS.ByteString, BS.ByteString) readByteStringFromBytes xs = @@ -136,10 +140,23 @@ readByteStringFromBytes xs = readByteString :: BS.ByteString -> IORef Int -> IO BS.ByteString readByteString buf pos = do size <- readVarIntFromBuffer @Int buf pos - BS.pack <$> replicateM size (readAndAdvance pos buf) + fillByteStringByWord8 size (\_ -> readAndAdvance pos buf) readByteString' :: BS.ByteString -> Int64 -> IO BS.ByteString -readByteString' buf size = BS.pack <$> mapM (`readSingleByte` buf) [0 .. (size - 1)] +readByteString' buf size = + fillByteStringByWord8 (fromIntegral size) ((`readSingleByte` buf) . fromIntegral) + +-- | Allocate a fix-sized buffer, repeat the action on each index. +-- Fill it into the buffer to get a ByteString. +fillByteStringByWord8 :: Int -> (Int -> IO Word8) -> IO BS.ByteString +fillByteStringByWord8 size getChar = do + p <- Foreign.mallocBytes size :: IO (Foreign.Ptr Word8) + fill 0 p + BSU.unsafePackCStringFinalizer p size (Foreign.free p) + where fill i p + | i >= size = pure () + | otherwise = getChar i >>= Foreign.pokeByteOff p i >> fill (i+1) p +{-# INLINE fillByteStringByWord8 #-} readSingleByte :: Int64 -> BS.ByteString -> IO Word8 readSingleByte pos buffer = return $ BS.index buffer (fromIntegral pos) From a9fad8c6c6634755eb3a513fc327657b942c7f20 Mon Sep 17 00:00:00 2001 From: Eiko Date: Sun, 8 Mar 2026 19:10:39 +0000 Subject: [PATCH 2/2] Make fillByteStringByWord8 async exception safe --- src/DataFrame/IO/Parquet/Binary.hs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/DataFrame/IO/Parquet/Binary.hs b/src/DataFrame/IO/Parquet/Binary.hs index 543c5522..4f366dff 100644 --- a/src/DataFrame/IO/Parquet/Binary.hs +++ b/src/DataFrame/IO/Parquet/Binary.hs @@ -3,6 +3,7 @@ module DataFrame.IO.Parquet.Binary where import Control.Monad +import Control.Exception (bracketOnError) import Data.Bits import qualified Data.ByteString as BS import Data.Char @@ -146,16 +147,21 @@ readByteString' :: BS.ByteString -> Int64 -> IO BS.ByteString readByteString' buf size = fillByteStringByWord8 (fromIntegral size) ((`readSingleByte` buf) . fromIntegral) --- | Allocate a fix-sized buffer, repeat the action on each index. +-- | Allocate a fixed-size buffer, repeat the action on each index. -- Fill it into the buffer to get a ByteString. fillByteStringByWord8 :: Int -> (Int -> IO Word8) -> IO BS.ByteString -fillByteStringByWord8 size getChar = do - p <- Foreign.mallocBytes size :: IO (Foreign.Ptr Word8) - fill 0 p - BSU.unsafePackCStringFinalizer p size (Foreign.free p) +fillByteStringByWord8 size getByte = do + bracketOnError + (Foreign.mallocBytes size :: IO (Foreign.Ptr Word8)) + Foreign.free + -- ^ ensures p is freed if (IO Word8) throws. + (\p -> do + fill 0 p + BSU.unsafePackCStringFinalizer p size (Foreign.free p) + ) where fill i p | i >= size = pure () - | otherwise = getChar i >>= Foreign.pokeByteOff p i >> fill (i+1) p + | otherwise = getByte i >>= Foreign.pokeByteOff p i >> fill (i+1) p {-# INLINE fillByteStringByWord8 #-} readSingleByte :: Int64 -> BS.ByteString -> IO Word8