From 43c74f9d92a3a5ab388cfaccef228382b057f9bb Mon Sep 17 00:00:00 2001 From: Matthew Craven Date: Fri, 12 Sep 2025 15:03:48 -0400 Subject: [PATCH 1/6] Add a few basic tests for unboxing and RULES working This uses tasty-inspection-testing, which is a plugin and so introduces a circular dependency: bytestring -> plugin -> ghc -> bytestring This is worked around with an ugly but simple hack: Guard the part of the testsuite that uses the plugin behind a flag, and change the name of the (local) package before trying to build the testsuite with that flag enabled. --- .github/workflows/ci.yml | 23 ++++++++- bytestring.cabal | 25 ++++++++- run-plugin-tests.sh | 11 ++++ tests/Main.hs | 8 +++ tests/PluginTests.hs | 98 ++++++++++++++++++++++++++++++++++++ tests/PluginTests/Splices.hs | 45 +++++++++++++++++ 6 files changed, 207 insertions(+), 3 deletions(-) create mode 100644 run-plugin-tests.sh create mode 100644 tests/PluginTests.hs create mode 100644 tests/PluginTests/Splices.hs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b683dfc40..315bbfe55 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -158,7 +158,7 @@ jobs: path: | ${{ steps.setup-haskell-cabal.outputs.cabal-store }} dist-newstyle - key: ${{ runner.os }}-latest + key: ${{ runner.os }}-latest-bounds-checking - name: Test run: cabal test --ghc-options='-fcheck-prim-bounds -fno-ignore-asserts -DHS_BYTESTRING_ASSERTIONS' @@ -183,6 +183,27 @@ jobs: - name: Test run: cabal test -fpure-haskell --ghc-options=-fno-ignore-asserts --enable-tests --test-show-details=direct all + inspection-testing: + needs: build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: haskell-actions/setup@v2 + id: setup-haskell-cabal + with: + ghc-version: 'latest' + - name: Update cabal package database + run: cabal update + - uses: actions/cache@v3 + name: Cache cabal stuff + with: + path: | + ${{ steps.setup-haskell-cabal.outputs.cabal-store }} + dist-newstyle + key: ${{ runner.os }}-latest-inspection-testing + - name: Test + run: sh run-plugin-tests.sh + i386: needs: build runs-on: ubuntu-latest diff --git a/bytestring.cabal b/bytestring.cabal index dcdd35a0c..35e6e5ac1 100644 --- a/bytestring.cabal +++ b/bytestring.cabal @@ -80,6 +80,14 @@ Flag pure-haskell default: False manual: True +Flag plugin-tests + description: Also build and run the tests that use plugins. + + (Due to circular dependencies, these tests currently + cannot be built without renaming the library.) + default: False + manual: True + source-repository head type: git location: https://github.com/haskell/bytestring @@ -216,16 +224,27 @@ test-suite bytestring-tests QuickCheckUtils hs-source-dirs: tests, tests/builder + build-depends: bytestring + -- Keep 'bytestring' on the same line as 'build-depends:' + -- this is used by our hack to allow plugin tests build-depends: base, - bytestring, deepseq, QuickCheck, tasty, + tasty-expected-failure ^>= 0.12.3, tasty-quickcheck >= 0.8.1, template-haskell, transformers >= 0.3, syb + if flag(plugin-tests) + cpp-options: -DBYTESTRING_PLUGIN_TESTS=1 + build-depends: tasty-inspection-testing ^>= 0.2.1 + other-modules: PluginTests + PluginTests.Splices + else + cpp-options: -DBYTESTRING_PLUGIN_TESTS=0 + ghc-options: -fwarn-unused-binds -rtsopts if !arch(wasm32) @@ -249,8 +268,10 @@ benchmark bytestring-bench ghc-options: -O2 "-with-rtsopts=-A32m" -fproc-alignment=64 + build-depends: bytestring + -- Keep 'bytestring' on the same line as 'build-depends:' + -- this is used by our hack to allow plugin tests build-depends: base, - bytestring, deepseq, tasty-bench, random diff --git a/run-plugin-tests.sh b/run-plugin-tests.sh new file mode 100644 index 000000000..7720d1bd9 --- /dev/null +++ b/run-plugin-tests.sh @@ -0,0 +1,11 @@ +# small script to hackily work around the dependency cycle +# 'bytestring -> [plugin] -> ghc -> bytestring' that prevents +# the testsuite from using plugins, by renaming the library +# to 'bytestring-plugins-hack' + +sed -E '/Name:|build-depends:/s/bytestring/bytestring-plugins-hack/' bytestring.cabal > bytestring-plugins-hack.cabal + +mv bytestring.cabal bytestring.cabal.__BACKUP__ +cabal test -fplugin-tests --test-show-details=direct "$@" +mv bytestring.cabal.__BACKUP__ bytestring.cabal +rm bytestring-plugins-hack.cabal diff --git a/tests/Main.hs b/tests/Main.hs index 043b5c415..b519e086c 100644 --- a/tests/Main.hs +++ b/tests/Main.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE CPP #-} + module Main (main) where import Test.Tasty @@ -7,6 +9,9 @@ import qualified IsValidUtf8 import qualified LazyHClose import qualified Lift import qualified Properties +#if BYTESTRING_PLUGIN_TESTS +import qualified PluginTests +#endif main :: IO () main = defaultMain $ testGroup "All" @@ -15,4 +20,7 @@ main = defaultMain $ testGroup "All" , LazyHClose.testSuite , Lift.testSuite , Properties.testSuite +#if BYTESTRING_PLUGIN_TESTS + , PluginTests.testSuite +#endif ] diff --git a/tests/PluginTests.hs b/tests/PluginTests.hs new file mode 100644 index 000000000..70ceec904 --- /dev/null +++ b/tests/PluginTests.hs @@ -0,0 +1,98 @@ +{-# LANGUAGE TemplateHaskell #-} + +{-# OPTIONS_GHC -O -dsuppress-all -dno-suppress-type-signatures -fplugin=Test.Tasty.Inspection.Plugin #-} + +module PluginTests (testSuite) where + +import qualified Data.ByteString as S +import qualified Data.ByteString.Char8 as S8 +import qualified Data.ByteString.Lazy as BL +import qualified Data.ByteString.Lazy.Char8 as BL8 +import Data.ByteString.Builder +import Data.Word + +import Test.Tasty +import Test.Tasty.ExpectedFailure +import Test.Tasty.Inspection + +import PluginTests.Splices + +testSuite :: TestTree +testSuite = testGroup "Inspection plugin tests" + [ testGroup "Literals" $ + [ testGroup "StrictByteString" + [ $(hasNoStringyStuff 'pack_strict_foo) + , $(inspectTest $ 'len_pack_strict_foo === 'literal_three) + , $(hasNoStringyStuff 'pack_strict_literal) + , $(inspectTest $ 'len_pack_strict_literal === 'literal_thirtyOne) + , expectFail $ $(hasNoStringyStuff 'pack_strict_nonAscii) + , expectFail $ $(hasNoStringyStuff 'pack_strict_literal_nonAscii) + ] + + , testGroup "Builder" + [ $(hasNoStringyStuff 'builder_string8_foo) + , $(hasNoStringyStuff 'builder_string8_literal) + , $(hasNoStringyStuff 'builder_stringUtf8_foo) + , $(hasNoStringyStuff 'builder_stringUtf8_literal) + , $(hasNoStringyStuff 'builder_stringUtf8_nonAscii) + , $(hasNoStringyStuff 'builder_stringUtf8_literal_nonAscii) + ] + ] + + , $(inspectTest $ 'append_pack_replicate_unboxing `hasNoType` ''S.ByteString) + ] + +foo_string_literal :: [Char] +foo_string_literal = "foo" + +unicode_string_literal :: [Char] +unicode_string_literal = "\0example\0 ... \xff \x1f530" + +pack_strict_foo :: S.ByteString +pack_strict_foo = S8.pack foo_string_literal + +len_pack_strict_foo :: Int +len_pack_strict_foo = S.length pack_strict_foo + +pack_strict_literal :: S.ByteString +pack_strict_literal = S8.pack "some ascii literal of length 31" + +len_pack_strict_literal :: Int +len_pack_strict_literal = S.length pack_strict_literal + +pack_strict_nonAscii :: S.ByteString +pack_strict_nonAscii = S8.pack unicode_string_literal + +pack_strict_literal_nonAscii :: S.ByteString +pack_strict_literal_nonAscii + = S8.pack "this\0literal contains\x80\xf0\xff non-ascii characters" + +literal_three :: Int +literal_three = 3 + +literal_thirtyOne :: Int +literal_thirtyOne = 31 + +builder_string8_foo :: Builder +builder_string8_foo = string8 foo_string_literal + +builder_string8_literal :: Builder +builder_string8_literal = string8 "some ascii string literal" + +builder_stringUtf8_foo :: Builder +builder_stringUtf8_foo = stringUtf8 foo_string_literal + +builder_stringUtf8_literal :: Builder +builder_stringUtf8_literal = stringUtf8 "some other ascii string literal" + +builder_stringUtf8_nonAscii :: Builder +builder_stringUtf8_nonAscii = stringUtf8 unicode_string_literal + +builder_stringUtf8_literal_nonAscii :: Builder +builder_stringUtf8_literal_nonAscii + = stringUtf8 "inline literal string containing \0special\0 and non-ASCII characters like \x2139" + +append_pack_replicate_unboxing :: Int -> Word8 -> Int +append_pack_replicate_unboxing n c + = S.count c $ S.append (S.pack [0..c]) (S.replicate n c) + diff --git a/tests/PluginTests/Splices.hs b/tests/PluginTests/Splices.hs new file mode 100644 index 000000000..7cd30f5a0 --- /dev/null +++ b/tests/PluginTests/Splices.hs @@ -0,0 +1,45 @@ +{-# LANGUAGE TemplateHaskellQuotes #-} + +module PluginTests.Splices where + +import Test.Tasty +import Test.Tasty.Inspection +import Language.Haskell.TH + +import GHC.Base + ( unpackCString# + , unpackAppendCString# + , unpackCStringUtf8# + , unpackAppendCStringUtf8# + , unpackNBytes# + , unpackFoldrCString# + , unpackFoldrCStringUtf8# + ) +import Language.Haskell.TH (Name) + +unpackCString_functions_without_foldr :: [Name] +unpackCString_functions_without_foldr = + [ 'unpackCString# + , 'unpackAppendCString# + , 'unpackCStringUtf8# + , 'unpackAppendCStringUtf8# + , 'unpackNBytes# + ] + +unpackCString_functions_all :: [Name] +unpackCString_functions_all = + [ 'unpackFoldrCString# + , 'unpackFoldrCStringUtf8# + ] ++ unpackCString_functions_without_foldr + +hasNoStringyStuff :: Name -> Q Exp +hasNoStringyStuff n = flip inspectObligations n + [ (`hasNoTypes` [''Char, ''[]]) + , (`doesNotUseAnyOf` unpackCString_functions_all) + ] + +hasNoStringyStuffExceptFolds :: Name -> Q Exp +hasNoStringyStuffExceptFolds n = flip inspectObligations n + [ (`hasNoTypes` [''Char, ''[]]) + , (`doesNotUseAnyOf` unpackCString_functions_without_foldr) + ] From dbfbc9c15c23e9364f31daa162c1280b4bdc35df Mon Sep 17 00:00:00 2001 From: Matthew Craven Date: Sat, 13 Sep 2025 20:16:11 -0400 Subject: [PATCH 2/6] Replace 'plugin-tests' flag with more sed hackery --- bytestring.cabal | 12 +++--------- run-plugin-tests.sh | 11 +++++++---- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/bytestring.cabal b/bytestring.cabal index 35e6e5ac1..62abbb4b1 100644 --- a/bytestring.cabal +++ b/bytestring.cabal @@ -80,14 +80,6 @@ Flag pure-haskell default: False manual: True -Flag plugin-tests - description: Also build and run the tests that use plugins. - - (Due to circular dependencies, these tests currently - cannot be built without renaming the library.) - default: False - manual: True - source-repository head type: git location: https://github.com/haskell/bytestring @@ -237,7 +229,9 @@ test-suite bytestring-tests transformers >= 0.3, syb - if flag(plugin-tests) + -- The following intentionally-funnily-spelled condition + -- is changed to 'true' by our hack to allow plugin-based tests + if false && impl(pluginTestsHack) cpp-options: -DBYTESTRING_PLUGIN_TESTS=1 build-depends: tasty-inspection-testing ^>= 0.2.1 other-modules: PluginTests diff --git a/run-plugin-tests.sh b/run-plugin-tests.sh index 7720d1bd9..4f1bc306e 100644 --- a/run-plugin-tests.sh +++ b/run-plugin-tests.sh @@ -3,9 +3,12 @@ # the testsuite from using plugins, by renaming the library # to 'bytestring-plugins-hack' -sed -E '/Name:|build-depends:/s/bytestring/bytestring-plugins-hack/' bytestring.cabal > bytestring-plugins-hack.cabal +sed -E ' + /Name:|build-depends:/s/bytestring/bytestring-plugins-hack/ ; + s/if false && impl\(pluginTestsHack\)/if true/' \ + bytestring.cabal > bytestring-plugins-hack.cabal -mv bytestring.cabal bytestring.cabal.__BACKUP__ -cabal test -fplugin-tests --test-show-details=direct "$@" -mv bytestring.cabal.__BACKUP__ bytestring.cabal +mv bytestring.cabal bytestring.cabal.__MOVED_DURING_PLUGIN_TESTS__ +cabal test --test-show-details=direct "$@" +mv bytestring.cabal.__MOVED_DURING_PLUGIN_TESTS__ bytestring.cabal rm bytestring-plugins-hack.cabal From 62d6da3604c5160be10da7190a33bebbe19a4d8e Mon Sep 17 00:00:00 2001 From: Matthew Craven Date: Sat, 13 Sep 2025 20:18:42 -0400 Subject: [PATCH 3/6] Fix emulated CI jobs --- .github/workflows/ci.yml | 2 +- bytestring.cabal | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 315bbfe55..92b8d4138 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -138,7 +138,7 @@ jobs: apt-get install -y ghc libghc-tasty-quickcheck-dev libghc-syb-dev run: | ghc --version - ghc --make -fPIC -XHaskell2010 -XBangPatterns -XDeriveDataTypeable -XDeriveGeneric -XDeriveLift -XFlexibleContexts -XFlexibleInstances -XLambdaCase -XMagicHash -XMultiWayIf -XNamedFieldPuns -XPatternSynonyms -XRankNTypes -XScopedTypeVariables -XStandaloneDeriving -XTupleSections -XTypeApplications -XTypeOperators -XUnboxedTuples -optP-Wall -optP-Werror=undef -DPURE_HASKELL=0 -Iinclude -itests:tests/builder -o Main cbits/*.c tests/Main.hs +RTS -s + ghc --make -fPIC -XHaskell2010 -XBangPatterns -XDeriveDataTypeable -XDeriveGeneric -XDeriveLift -XFlexibleContexts -XFlexibleInstances -XLambdaCase -XMagicHash -XMultiWayIf -XNamedFieldPuns -XPatternSynonyms -XRankNTypes -XScopedTypeVariables -XStandaloneDeriving -XTupleSections -XTypeApplications -XTypeOperators -XUnboxedTuples -optP-Wall -optP-Werror=undef -DPURE_HASKELL=0 -DBYTESTRING_PLUGIN_TESTS=0 -Iinclude -itests:tests/builder -o Main cbits/*.c tests/Main.hs +RTS -s ./Main +RTS -s bounds-checking: diff --git a/bytestring.cabal b/bytestring.cabal index 62abbb4b1..06305d649 100644 --- a/bytestring.cabal +++ b/bytestring.cabal @@ -223,7 +223,6 @@ test-suite bytestring-tests deepseq, QuickCheck, tasty, - tasty-expected-failure ^>= 0.12.3, tasty-quickcheck >= 0.8.1, template-haskell, transformers >= 0.3, @@ -233,7 +232,8 @@ test-suite bytestring-tests -- is changed to 'true' by our hack to allow plugin-based tests if false && impl(pluginTestsHack) cpp-options: -DBYTESTRING_PLUGIN_TESTS=1 - build-depends: tasty-inspection-testing ^>= 0.2.1 + build-depends: tasty-inspection-testing ^>= 0.2.1, + tasty-expected-failure ^>= 0.12.3 other-modules: PluginTests PluginTests.Splices else From af12adcb35d8171618b7b56a9375a820442042ee Mon Sep 17 00:00:00 2001 From: Matthew Craven Date: Sat, 13 Sep 2025 20:18:57 -0400 Subject: [PATCH 4/6] fiddle with wording --- bytestring.cabal | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bytestring.cabal b/bytestring.cabal index 06305d649..09879788c 100644 --- a/bytestring.cabal +++ b/bytestring.cabal @@ -218,7 +218,7 @@ test-suite bytestring-tests tests/builder build-depends: bytestring -- Keep 'bytestring' on the same line as 'build-depends:' - -- this is used by our hack to allow plugin tests + -- this is used by our hack to allow plugin-based tests build-depends: base, deepseq, QuickCheck, @@ -264,7 +264,7 @@ benchmark bytestring-bench -fproc-alignment=64 build-depends: bytestring -- Keep 'bytestring' on the same line as 'build-depends:' - -- this is used by our hack to allow plugin tests + -- this is used by our hack to allow plugin-based tests build-depends: base, deepseq, tasty-bench, From 1f4666e2ccfa6c28eb8dc97b1d5d7cb4ad609170 Mon Sep 17 00:00:00 2001 From: Viktor Dukhovni Date: Thu, 12 Jan 2023 21:59:41 -0500 Subject: [PATCH 5/6] Avoid per-byte loop in cstring{,Utf8} builders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copy chunks of the input to the output buffer with, up to the shorter of the available buffer space and the "null-free" portion of the remaining string. Actually "null-free" here means not containing any denormalised two-byte encodings starting with 0xC0 (so possibly also other ASCII bytes if the UTF-8 encoding is oddball). This substantially improves performance, with just one "15%" increase that looks like a spurious measurement error (perhaps code layout difference artefact). UTF-8 String (12B): OK 16.7 ns ± 1.3 ns, 60% less than baseline UTF-8 String (64B, one null): OK 22.6 ns ± 1.3 ns, 87% less than baseline UTF-8 String (64B, one null, no shared work): OK 30.1 ns ± 2.6 ns, 83% less than baseline UTF-8 String (64B, half nulls): OK 92.6 ns ± 5.3 ns, 49% less than baseline UTF-8 String (64B, all nulls): OK 76.3 ns ± 4.5 ns, 57% less than baseline UTF-8 String (64B, all nulls, no shared work): OK 82.3 ns ± 5.6 ns, 54% less than baseline ASCII String (12B): OK 6.50 ns ± 326 ps, 76% less than baseline ASCII String (64B): OK 8.03 ns ± 334 ps, 94% less than baseline AsciiLit: OK 8.02 ns ± 648 ps, 94% less than baseline Utf8Lit: OK 21.8 ns ± 1.3 ns, 88% less than baseline strLit: OK 8.90 ns ± 788 ps, 94% less than baseline stringUtf8: OK 22.4 ns ± 1.3 ns, 87% less than baseline strLitInline: OK 8.26 ns ± 676 ps, 94% less than baseline utf8LitInline: OK 23.2 ns ± 1.3 ns, 87% less than baseline foldMap byteStringInsert (10000): OK 46.0 μs ± 4.0 μs, 15% less than baseline --> lazyByteStringHex (10000): OK --> 4.74 μs ± 337 ns, 15% more than baseline foldMap integerDec (small) (10000): OK 205 μs ± 12 μs, 9% less than baseline char8 (10000): OK 2.58 μs ± 234 ns, 30% less than baseline foldMap (left-assoc) (10000): OK 73.2 μs ± 2.9 μs, 54% less than baseline foldMap (right-assoc) (10000): OK 43.0 μs ± 4.2 μs, 65% less than baseline foldMap [manually fused, left-assoc] (10000): OK 81.4 μs ± 5.3 μs, 48% less than baseline foldMap [manually fused, right-assoc] (10000): OK 47.3 μs ± 785 ns, 61% less than baseline --- Data/ByteString/Builder/Internal.hs | 110 +++++++++++++++++- Data/ByteString/Builder/Prim.hs | 77 +++++------- bench/BenchAll.hs | 6 + .../Data/ByteString/Builder/Prim/Tests.hs | 22 +++- 4 files changed, 160 insertions(+), 55 deletions(-) diff --git a/Data/ByteString/Builder/Internal.hs b/Data/ByteString/Builder/Internal.hs index 8bb6278bd..a699a545c 100644 --- a/Data/ByteString/Builder/Internal.hs +++ b/Data/ByteString/Builder/Internal.hs @@ -1,6 +1,8 @@ -{-# LANGUAGE Unsafe #-} -{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE MagicHash #-} {-# LANGUAGE NoMonoLocalBinds #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE Unsafe #-} +{-# LANGUAGE ViewPatterns #-} {-# OPTIONS_HADDOCK not-home #-} @@ -87,6 +89,8 @@ module Data.ByteString.Builder.Internal ( -- , sizedChunksInsert , byteStringCopy + , asciiLiteralCopy + , modUtf8LitCopy , byteStringInsert , byteStringThreshold @@ -816,6 +820,7 @@ ensureFree :: Int -> Builder ensureFree minFree = builder step where + step :: forall r. BuildStep r -> BuildStep r step k br@(BufferRange op ope) | ope `minusPtr` op < minFree = return $ bufferFull minFree op k | otherwise = k br @@ -839,6 +844,25 @@ wrappedBytesCopyStep bs0 k = where outRemaining = ope `minusPtr` op +-- | Copy the bytes from a 'BufferRange' into the output stream. +wrappedBufferRangeCopyStep :: BufferRange -- ^ Input 'BufferRange'. + -> BuildStep a -> BuildStep a +wrappedBufferRangeCopyStep (BufferRange ip0 ipe) k = + go ip0 + where + go !ip (BufferRange op ope) + | inpRemaining <= outRemaining = do + copyBytes op ip inpRemaining + let !br' = BufferRange (op `plusPtr` inpRemaining) ope + k br' + | otherwise = do + copyBytes op ip outRemaining + let !ip' = ip `plusPtr` outRemaining + return $ bufferFull 1 ope (go ip') + where + outRemaining = ope `minusPtr` op + inpRemaining = ipe `minusPtr` ip + -- Strict ByteStrings ------------------------------------------------------------------------------ @@ -858,6 +882,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder byteStringThreshold maxCopySize = \bs -> builder $ step bs where + step :: forall r. S.ByteString -> BuildStep r -> BuildStep r step bs@(S.BS _ len) k br@(BufferRange !op _) | len <= maxCopySize = byteStringCopyStep bs k br | otherwise = return $ insertChunk op bs k @@ -949,6 +974,87 @@ byteStringInsert :: S.StrictByteString -> Builder byteStringInsert = \bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k + +------------------------------------------------------------------------------ +-- Raw CString encoding +------------------------------------------------------------------------------ + +-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII +-- strings that are free of embedded (overlong-encoded as the two-byte sequence +-- @0xC0 0x80@) null characters. +-- +-- @since 0.11.5.0 +{-# INLINABLE asciiLiteralCopy #-} +asciiLiteralCopy :: Ptr Word8 -> Int -> Builder +asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) -> + if len <= ope `minusPtr` op + then copyBytes op ip len >> k (BufferRange (op `plusPtr` len) ope) + else wrappedBufferRangeCopyStep (BufferRange ip (ip `plusPtr` len)) k br + +-- | Builder for pointers to /null-terminated/ primitive UTF-8 encoded strings +-- that may contain embedded overlong two-byte encodings of the NUL character +-- as @0xC0 0x80@. Other deviations from strict UTF-8 are tolerated, but the +-- result is not well defined. +-- +-- @since 0.11.5.0 +{-# INLINABLE modUtf8LitCopy #-} +modUtf8LitCopy :: Ptr Word8 -> Int -> Builder +modUtf8LitCopy !ip !len + | len > 0 = builder (modUtf8_step ip len) + | otherwise = builder id + +-- | Copy a /non-empty/ UTF-8 input possibly containing denormalised 2-octet +-- sequences. While only the NUL byte should ever encoded that way (as @0xC0 +-- 80@), this handles other denormalised @0xC0 0x??@ sequences by keeping the +-- bottom 6 bits of the second byte. If the input is non-UTF8 garbage, the the +-- result may not be what the user expected. +-- +modUtf8_step :: Ptr Word8 -> Int -> BuildStep r -> BuildStep r +modUtf8_step !ip !len k (BufferRange op ope) + | op == ope = return $ bufferFull 1 op (modUtf8_step ip len k) + | otherwise = do + let !avail = ope `minusPtr` op + !usable = avail `min` len + -- null-termination makes it possible to read one more byte than the + -- nominal input length, with any unexpected 0xC000 ending interpreted + -- as a NUL. More typically, this simplifies hanlding of inputs where + -- 0xC0 0x80 might otherwise be split across the "usable" input window. + !ch <- peekElemOff ip (usable - 1) + let !use | ch /= 0xC0 = usable + | otherwise = usable + 1 + !n <- utf8_copyBytes (ip `plusPtr` use) ip op + let !op' = op `plusPtr` n + !len' = len - use + ip' = ip `plusPtr` use + if | len' <= 0 -> k (BufferRange op' ope) + | op' < ope -> modUtf8_step ip' len' k (BufferRange op' ope) + | otherwise -> return $ bufferFull 1 op' (modUtf8_step ip' len' k) + +-- | Consume the supplied input returning the number of bytes written +utf8_copyBytes :: Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO Int +utf8_copyBytes !ipe = \ ip op -> go 0 ip op + where + go !n !ip@((< ipe) -> True) !op = do + !ch <- peek ip + let !ip' = ip `plusPtr` 1 + !op' = op `plusPtr` 1 + if | ch /= 0xC0 -> do + poke op ch + let !cnt = ipe `minusPtr` ip' + !runend <- S.memchr ip' 0xC0 (fromIntegral cnt) + let !runlen | runend == nullPtr = cnt + | otherwise = runend `minusPtr` ip' + if (runlen == 0) + then go (n + 1) ip' op' + else do + copyBytes op' ip' runlen + go (n + 1 + runlen) (ip' `plusPtr` runlen) (op' `plusPtr` runlen) + | otherwise -> do + !ch' <- peek ip' + poke op (ch' .&. 0x3f) + go (n + 1) (ip' `plusPtr` 1) op' + go !n _ _ = pure n + -- Short bytestrings ------------------------------------------------------------------------------ diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs index 82f5d18a9..acb3eb69b 100644 --- a/Data/ByteString/Builder/Prim.hs +++ b/Data/ByteString/Builder/Prim.hs @@ -1,5 +1,8 @@ +{-# LANGUAGE CPP #-} {-# LANGUAGE Trustworthy #-} +#include "bytestring-cpp-macros.h" + {- | Copyright : (c) 2010-2011 Simon Meier (c) 2010 Jasper van der Jeugt License : BSD3-style (see LICENSE) @@ -463,10 +466,11 @@ import Data.ByteString.Builder.Prim.Binary import Data.ByteString.Builder.Prim.ASCII import Foreign +#if !MIN_VERSION_base(4,15,0) +import Foreign.C (CSize(..), CString) +#endif import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) -import GHC.Word (Word8 (..)) import GHC.Exts -import GHC.IO ------------------------------------------------------------------------------ -- Creating Builders from bounded primitives @@ -658,59 +662,36 @@ primMapLazyByteStringBounded w = L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty ------------------------------------------------------------------------------- --- Raw CString encoding ------------------------------------------------------------------------------- - --- | A null-terminated ASCII encoded 'Foreign.C.String.CString'. --- Null characters are not representable. +-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII +-- strings that are free of embedded null characters. -- -- @since 0.11.0.0 cstring :: Addr# -> Builder -cstring = - \addr0 -> builder $ step addr0 - where - step :: Addr# -> BuildStep r -> BuildStep r - step !addr !k br@(BufferRange op0@(Ptr op0#) ope) - | W8# ch == 0 = k br - | op0 == ope = - return $ bufferFull 1 op0 (step addr k) - | otherwise = do - IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of - s' -> (# s', () #) - let br' = BufferRange (op0 `plusPtr` 1) ope - step (addr `plusAddr#` 1#) k br' - where - !ch = indexWord8OffAddr# addr 0# +cstring s = asciiLiteralCopy (Ptr s) (byteCountLiteral s) +{-# INLINE cstring #-} --- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'. --- Null characters can be encoded as @0xc0 0x80@. +-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8 +-- encoded strings in which any emebded null characters are represented via +-- the two-byte overlong-encoding: @0xC0 0x80@. -- -- @since 0.11.0.0 cstringUtf8 :: Addr# -> Builder -cstringUtf8 = - \addr0 -> builder $ step addr0 - where - step :: Addr# -> BuildStep r -> BuildStep r - step !addr !k br@(BufferRange op0@(Ptr op0#) ope) - | W8# ch == 0 = k br - | op0 == ope = - return $ bufferFull 1 op0 (step addr k) - -- NULL is encoded as 0xc0 0x80 - | W8# ch == 0xc0 - , W8# (indexWord8OffAddr# addr 1#) == 0x80 = do - let !(W8# nullByte#) = 0 - IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of - s' -> (# s', () #) - let br' = BufferRange (op0 `plusPtr` 1) ope - step (addr `plusAddr#` 2#) k br' - | otherwise = do - IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of - s' -> (# s', () #) - let br' = BufferRange (op0 `plusPtr` 1) ope - step (addr `plusAddr#` 1#) k br' - where - !ch = indexWord8OffAddr# addr 0# +cstringUtf8 s = modUtf8LitCopy (Ptr s) (byteCountLiteral s) +{-# INLINE cstringUtf8 #-} + +-- | Byte count of null-terminated primitive literal string excluding the +-- terminating null byte. +byteCountLiteral :: Addr# -> Int +byteCountLiteral addr# = +#if HS_cstringLength_AND_FinalPtr_AVAILABLE + I# (cstringLength# addr#) +#else + fromIntegral (pure_strlen (Ptr addr#)) + +foreign import ccall unsafe "string.h strlen" pure_strlen + :: CString -> CSize +#endif +{-# INLINE byteCountLiteral #-} ------------------------------------------------------------------------------ -- Char8 encoding diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index 7f95a3e6f..845365aad 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -327,6 +327,12 @@ main = do , benchB'_ "ASCII String (12B)" $ asciiLit (Ptr "hello wurld!"#) , benchB' "ASCII String (64B, naive)" asciiStr fromString , benchB'_ "ASCII String (64B)" $ asciiLit asciiBuf + , benchB'_ "AsciiLit" $ asciiLit asciiBuf + , benchB'_ "Utf8Lit" $ utf8Lit utf8Buf + , benchB'_ "strLit" $ string8 asciiStr + , benchB'_ "stringUtf8" $ stringUtf8 utf8Str + , benchB'_ "strLitInline" $ string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" + , benchB'_ "utf8LitInline" $ stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" ] , bgroup "Encoding wrappers" diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs index 60e0cf4a8..c9f5d5abd 100644 --- a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs @@ -14,6 +14,7 @@ import Data.Char (ord) import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Char8 as LC import Data.ByteString.Builder +import Data.ByteString.Builder.Extra as BE import qualified Data.ByteString.Builder.Prim as BP import Data.ByteString.Builder.Prim.TestUtils @@ -22,17 +23,28 @@ import Test.Tasty.QuickCheck tests :: [TestTree] tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8 - , testsCombinatorsB, [testCString, testCStringUtf8] ] + , testsCombinatorsB + , [ testCString + , testCStringUtf8 1 + , testCStringUtf8 6 + , testCStringUtf8 64 + ] + ] testCString :: TestTree testCString = testProperty "cstring" $ toLazyByteString (BP.cstring "hello world!"#) == LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!" -testCStringUtf8 :: TestTree -testCStringUtf8 = testProperty "cstringUtf8" $ - toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) == - LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!" +testCStringUtf8 :: Int -> TestTree +testCStringUtf8 sz = testProperty "cstringUtf8" $ + BE.toLazyByteStringWith (BE.untrimmedStrategy sz sz) L.empty + (BP.cstringUtf8 "hello\xc0\x80\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\xc0\x80\xC0"#) == + LC.pack "hello" `L.append` L.singleton 0x00 + `L.append` L.singleton 0x00 + `L.append` LC.pack "\xd0\xbc\xd0\xb8\xd1\x80" + `L.append` L.singleton 0x00 + `L.append` L.singleton 0x00 ------------------------------------------------------------------------------ -- Binary From d63b19885ab0a396b9e4014910e61c34e378416f Mon Sep 17 00:00:00 2001 From: Viktor Dukhovni Date: Wed, 27 Aug 2025 16:45:08 +1000 Subject: [PATCH 6/6] fixup! Avoid per-byte loop in cstring{,Utf8} builders Matthew Craven review fixes --- Data/ByteString/Builder/Internal.hs | 19 ++++++++++--------- Data/ByteString/Builder/Prim.hs | 25 +++---------------------- Data/ByteString/Internal/Type.hs | 13 +++++++++++++ bench/BenchAll.hs | 2 -- 4 files changed, 26 insertions(+), 33 deletions(-) diff --git a/Data/ByteString/Builder/Internal.hs b/Data/ByteString/Builder/Internal.hs index a699a545c..12862016d 100644 --- a/Data/ByteString/Builder/Internal.hs +++ b/Data/ByteString/Builder/Internal.hs @@ -1,7 +1,7 @@ -{-# LANGUAGE MagicHash #-} -{-# LANGUAGE NoMonoLocalBinds #-} -{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE Unsafe #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE NoMonoLocalBinds #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_HADDOCK not-home #-} @@ -979,11 +979,11 @@ byteStringInsert = -- Raw CString encoding ------------------------------------------------------------------------------ --- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII --- strings that are free of embedded (overlong-encoded as the two-byte sequence --- @0xC0 0x80@) null characters. +-- | Builder for raw pointers to static data of known length that will never be +-- moved or freed. (This is used with the static buffers GHC uses to implement +-- ASCII string literals that do not contain null characters.) -- --- @since 0.11.5.0 +-- @since 0.13.0.0 {-# INLINABLE asciiLiteralCopy #-} asciiLiteralCopy :: Ptr Word8 -> Int -> Builder asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) -> @@ -996,7 +996,7 @@ asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) -> -- as @0xC0 0x80@. Other deviations from strict UTF-8 are tolerated, but the -- result is not well defined. -- --- @since 0.11.5.0 +-- @since 0.13.0.0 {-# INLINABLE modUtf8LitCopy #-} modUtf8LitCopy :: Ptr Word8 -> Int -> Builder modUtf8LitCopy !ip !len @@ -1034,6 +1034,7 @@ modUtf8_step !ip !len k (BufferRange op ope) utf8_copyBytes :: Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO Int utf8_copyBytes !ipe = \ ip op -> go 0 ip op where + go :: Int -> Ptr Word8 -> Ptr Word8 -> IO Int go !n !ip@((< ipe) -> True) !op = do !ch <- peek ip let !ip' = ip `plusPtr` 1 @@ -1041,7 +1042,7 @@ utf8_copyBytes !ipe = \ ip op -> go 0 ip op if | ch /= 0xC0 -> do poke op ch let !cnt = ipe `minusPtr` ip' - !runend <- S.memchr ip' 0xC0 (fromIntegral cnt) + !runend <- S.memchr ip' 0xC0 (fromIntegral @Int cnt) let !runlen | runend == nullPtr = cnt | otherwise = runend `minusPtr` ip' if (runlen == 0) diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs index acb3eb69b..6f0cd09e5 100644 --- a/Data/ByteString/Builder/Prim.hs +++ b/Data/ByteString/Builder/Prim.hs @@ -1,8 +1,5 @@ -{-# LANGUAGE CPP #-} {-# LANGUAGE Trustworthy #-} -#include "bytestring-cpp-macros.h" - {- | Copyright : (c) 2010-2011 Simon Meier (c) 2010 Jasper van der Jeugt License : BSD3-style (see LICENSE) @@ -456,6 +453,7 @@ import Data.ByteString.Builder.Internal import qualified Data.ByteString as S import qualified Data.ByteString.Internal as S +import qualified Data.ByteString.Internal.Type as S import qualified Data.ByteString.Lazy.Internal as L import Data.Char (ord) @@ -466,9 +464,6 @@ import Data.ByteString.Builder.Prim.Binary import Data.ByteString.Builder.Prim.ASCII import Foreign -#if !MIN_VERSION_base(4,15,0) -import Foreign.C (CSize(..), CString) -#endif import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) import GHC.Exts @@ -667,7 +662,7 @@ primMapLazyByteStringBounded w = -- -- @since 0.11.0.0 cstring :: Addr# -> Builder -cstring s = asciiLiteralCopy (Ptr s) (byteCountLiteral s) +cstring s = asciiLiteralCopy (Ptr s) (S.byteCountLiteral s) {-# INLINE cstring #-} -- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8 @@ -676,23 +671,9 @@ cstring s = asciiLiteralCopy (Ptr s) (byteCountLiteral s) -- -- @since 0.11.0.0 cstringUtf8 :: Addr# -> Builder -cstringUtf8 s = modUtf8LitCopy (Ptr s) (byteCountLiteral s) +cstringUtf8 s = modUtf8LitCopy (Ptr s) (S.byteCountLiteral s) {-# INLINE cstringUtf8 #-} --- | Byte count of null-terminated primitive literal string excluding the --- terminating null byte. -byteCountLiteral :: Addr# -> Int -byteCountLiteral addr# = -#if HS_cstringLength_AND_FinalPtr_AVAILABLE - I# (cstringLength# addr#) -#else - fromIntegral (pure_strlen (Ptr addr#)) - -foreign import ccall unsafe "string.h strlen" pure_strlen - :: CString -> CSize -#endif -{-# INLINE byteCountLiteral #-} - ------------------------------------------------------------------------------ -- Char8 encoding ------------------------------------------------------------------------------ diff --git a/Data/ByteString/Internal/Type.hs b/Data/ByteString/Internal/Type.hs index 0927771e1..725b43ec2 100644 --- a/Data/ByteString/Internal/Type.hs +++ b/Data/ByteString/Internal/Type.hs @@ -44,6 +44,7 @@ module Data.ByteString.Internal.Type ( unsafePackAddress, unsafePackLenAddress, unsafePackLiteral, unsafePackLenLiteral, literalFromOctetString, literalFromHex, + byteCountLiteral, -- * Low level imperative construction empty, @@ -486,6 +487,18 @@ unsafePackLenAddress len addr# = do #endif {-# INLINE unsafePackLenAddress #-} +-- | Byte count of null-terminated primitive literal string excluding the +-- terminating null byte. +byteCountLiteral :: Addr# -> Int +byteCountLiteral addr# = +#if HS_cstringLength_AND_FinalPtr_AVAILABLE + I# (cstringLength# addr#) +#else + fromIntegral @CSize @Int $ + accursedUnutterablePerformIO (c_strlen (Ptr addr#)) +#endif +{-# INLINE byteCountLiteral #-} + -- | See 'unsafePackAddress'. This function has similar behavior. Prefer -- this function when the address in known to be an @Addr#@ literal. In -- that context, there is no need for the sequencing guarantees that 'IO' diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index 845365aad..e81f0e8d2 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -327,8 +327,6 @@ main = do , benchB'_ "ASCII String (12B)" $ asciiLit (Ptr "hello wurld!"#) , benchB' "ASCII String (64B, naive)" asciiStr fromString , benchB'_ "ASCII String (64B)" $ asciiLit asciiBuf - , benchB'_ "AsciiLit" $ asciiLit asciiBuf - , benchB'_ "Utf8Lit" $ utf8Lit utf8Buf , benchB'_ "strLit" $ string8 asciiStr , benchB'_ "stringUtf8" $ stringUtf8 utf8Str , benchB'_ "strLitInline" $ string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"