Skip to content

Commit

Permalink
Use a custom parsing monad instead of attoparsec.
Browse files Browse the repository at this point in the history
All decoding benchmarks show significant speedups after this change.
The biggest improvement is to decoding packed data which is 4-5x as fast
as before.  (See below for a full list of benchmark diffs.)

This parsing monad follows the approach of, e.g., the `store` and `persist`
packages.  It requires that all data be in a *strict* `ByteString`,
and uses simple pointer arithmetic internally to walk through its bytes.

This effectively works against #62 (streaming parsers) since it
needs to read all the input data before starting the parse.  However,
that issue has already existed since the beginning of this library for,
e.g., submessages; see that bug for more details.  So this change
doesn't appear to be a regression.  We also have freedom to later
try out different implementations without changing the API, since
`Parser` is opaque as of #294.

The implementation of Parser differs from `store` and `persist` by using
`ExceptT` to pass around errors internally, rather than exceptions (or
closures, as in `attoparsec`).  We may want to experiment with this later,
but in my initial experiments I didn't see a significant improvement
from those approaches.

Benchmark results (the "time" output from Criterion):

flat(602B)/decode/whnf:
   13.14 μs   (13.02 μs .. 13.29 μs)
=> 8.686 μs   (8.514 μs .. 8.873 μs)

nested(900B)/decode/whnf:
    26.35 μs   (25.85 μs .. 26.86 μs)
=>  14.01 μs   (13.86 μs .. 14.18 μs)

int32-packed(1003B)/decode/whnf:
   36.23 μs   (35.75 μs .. 36.69 μs)
=> 17.31 μs   (17.11 μs .. 17.50 μs)

int32-unpacked(2000B)/decode/whnf:
   65.18 μs   (64.19 μs .. 66.68 μs)
=> 19.35 μs   (19.13 μs .. 19.58 μs)

float-packed(4003B)/decode/whnf:
   78.61 μs   (77.53 μs .. 79.46 μs)
=> 19.56 μs   (19.40 μs .. 19.76 μs)

float-unpacked(5000B)/decode/whnf:
   108.9 μs   (107.8 μs .. 110.3 μs)
=> 22.29 μs   (22.00 μs .. 22.66 μs)

no-unused(10003B)/decode/whnf:
   571.7 μs   (560.0 μs .. 586.6 μs)
=> 356.5 μs   (349.0 μs .. 365.0 μs)

with-unused(10003B)/decode/whnf:
   786.6 μs   (697.8 μs .. 875.5 μs)
=> 368.3 μs   (361.8 μs .. 376.4 μs)
  • Loading branch information
judah committed Dec 21, 2018
1 parent 123e486 commit ea02c34
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 49 deletions.
28 changes: 12 additions & 16 deletions proto-lens/package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,10 @@ data-files:

library:
source-dirs: src
exposed-modules:
- Data.ProtoLens
- Data.ProtoLens.Combinators
- Data.ProtoLens.Default
- Data.ProtoLens.Encoding
- Data.ProtoLens.Encoding.Bytes
- Data.ProtoLens.Encoding.Wire
- Data.ProtoLens.Message
- Data.ProtoLens.Message.Enum
- Data.ProtoLens.Service.Types
- Data.ProtoLens.TextFormat
- Proto.Google.Protobuf.Compiler.Plugin
- Proto.Google.Protobuf.Compiler.Plugin_Fields
- Proto.Google.Protobuf.Descriptor
- Proto.Google.Protobuf.Descriptor_Fields

other-modules:
- Data.ProtoLens.TextFormat.Parser
dependencies:
- attoparsec == 0.13.*
- base >= 4.9 && < 4.13
- bytestring == 0.10.*
- containers >= 0.5 && < 0.7
Expand All @@ -60,3 +44,15 @@ library:
- text == 1.2.*
- transformers >= 0.4 && < 0.6
- void == 0.7.*

tests:
parser_test:
main: parser_test.hs
source-dirs: tests
dependencies:
- base
- bytestring
- proto-lens
- QuickCheck
- test-framework
- test-framework-quickcheck2
36 changes: 3 additions & 33 deletions proto-lens/src/Data/ProtoLens/Encoding/Bytes.hs
Original file line number Diff line number Diff line change
Expand Up @@ -42,38 +42,24 @@ module Data.ProtoLens.Encoding.Bytes(
(<?>),
) where

import qualified Data.Attoparsec.ByteString as Atto
import Data.Bits
import Data.ByteString (ByteString)
import Data.ByteString.Lazy.Builder as Builder
import qualified Data.ByteString.Lazy as L
import Data.Int (Int32, Int64)
import Data.Monoid ((<>))
import Data.Word (Word8, Word32, Word64)
import Data.Word (Word32, Word64)
import Foreign.Ptr (castPtr)
import Foreign.Marshal.Alloc (alloca)
import Foreign.Storable (Storable, peek, poke)
import System.IO.Unsafe (unsafePerformIO)

-- | A parsing monad for decoding the wire format.
newtype Parser a = Parser (Atto.Parser a)
deriving (Functor, Applicative, Monad)

-- | Evaluates a parser on the given input.
--
-- If the parser does not consume all of the input, the rest of the
-- input is discarded and the parser still succeeds.
runParser :: Parser a -> ByteString -> Either String a
runParser (Parser p) = Atto.parseOnly p
import Data.ProtoLens.Encoding.Parser

-- | Constructs a strict 'ByteString' from the given 'Builder'.
runBuilder :: Builder -> ByteString
runBuilder = L.toStrict . Builder.toLazyByteString

-- | Parse a @ByteString@ of the given length.
getBytes :: Int -> Parser ByteString
getBytes = Parser . Atto.take

-- | Emit a given @ByteString@.
putBytes :: ByteString -> Builder
putBytes = Builder.byteString
Expand All @@ -96,18 +82,8 @@ putVarInt n
| otherwise = Builder.word8 (fromIntegral $ n .&. 127 .|. 128)
<> putVarInt (n `shiftR` 7)

getWord8 :: Parser Word8
getWord8 = Parser Atto.anyWord8

-- | Little-endian decoding function.
getFixed32 :: Parser Word32
getFixed32 = do
b1 <- getWord8
b2 <- getWord8
b3 <- getWord8
b4 <- getWord8
return $ ((fromIntegral b4 `shiftL` 8 + fromIntegral b3)
`shiftL` 8 + fromIntegral b2) `shiftL` 8 + fromIntegral b1
getFixed32 = getWord32le

getFixed64 :: Parser Word64
getFixed64 = do
Expand Down Expand Up @@ -163,9 +139,3 @@ wordToSignedInt64 n

runEither :: Either String a -> Parser a
runEither = either fail return

atEnd :: Parser Bool
atEnd = Parser Atto.atEnd

(<?>) :: Parser a -> String -> Parser a
Parser p <?> msg = Parser (p Atto.<?> msg)
101 changes: 101 additions & 0 deletions proto-lens/src/Data/ProtoLens/Encoding/Parser.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
-- | A custom parsing monad, optimized for speed.
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Data.ProtoLens.Encoding.Parser
( Parser
, runParser
, atEnd
, getWord8
, getWord32le
, getBytes
, (<?>)
) where

import Data.Bits (shiftL, (.|.))
import Foreign.Ptr
import Foreign.Storable
import Control.Monad.Trans.Except
import Data.Word (Word8, Word32)
import qualified Data.ByteString as B
import qualified Data.ByteString.Unsafe as B
import Data.ByteString (ByteString)
import Control.Monad (ap)
import Control.Monad.IO.Class
import System.IO.Unsafe

-- | A monad for parsing an input buffer.
newtype Parser a = Parser
{ unParser :: Ptr Word8 -- ^ End position of the input
-> Ptr Word8 -- ^ Current position in the input
-> ExceptT String IO (ParserResult a) }

data ParserResult a = ParserResult
{ _newPos :: !(Ptr Word8) -- ^ New position in the input
, unParserResult :: a
}

instance Functor ParserResult where
fmap f (ParserResult p x) = ParserResult p (f x)

instance Functor Parser where
fmap f (Parser g) = Parser $ \end cur -> fmap (fmap f) $ g end cur

instance Applicative Parser where
pure x = Parser $ \_ cur -> return $ ParserResult cur x
(<*>) = ap

instance Monad Parser where
fail s = Parser $ \_ _ -> throwE s
return = pure
Parser f >>= g = Parser $ \end pos -> do
ParserResult pos' x <- f end pos
unParser (g x) end pos'

-- | Evaluates a parser on the given input.
--
-- If the parser does not consume all of the input, the rest of the
-- input is discarded and the parser still succeeds.
runParser :: Parser a -> ByteString -> Either String a
runParser (Parser m) b = unsafePerformIO $ B.unsafeUseAsCStringLen b $ \(p, len)
-> runExceptT $ fmap unParserResult $ m (p `plusPtr` len) (castPtr p)

atEnd :: Parser Bool
atEnd = Parser $ \end pos -> return $ ParserResult pos (pos == end)

-- | Parse a one-byte word.
getWord8 :: Parser Word8
getWord8 = withSized 1 "getWord8: Unexpected end of input" peek

-- | Parser a 4-byte word in little-endian order.
getWord32le :: Parser Word32
getWord32le = withSized 4 "getWord32le: Unexpected end of input" $ \pos -> do
b1 <- peek pos
b2 <- peek $ pos `plusPtr'` 1
b3 <- peek $ pos `plusPtr'` 2
b4 <- peek $ pos `plusPtr'` 3
return $ ((fromIntegral b4 `shiftL` 8 .|. fromIntegral b3)
`shiftL` 8 .|. fromIntegral b2) `shiftL` 8 .|. fromIntegral b1

-- | Parse a 'B.ByteString' of the given length.
getBytes :: Int -> Parser B.ByteString
getBytes n = withSized n "getBytes: Unexpected end of input" $ \pos ->
B.packCStringLen (castPtr pos, n)

-- | Helper function for reading bytes from the current position and
-- advancing the pointer.
withSized :: Int -> String -> (Ptr Word8 -> IO a) -> Parser a
withSized len message f = Parser $ \end pos ->
let pos' = pos `plusPtr'` len
in if pos' > end
then throwE $ message
else liftIO $ ParserResult pos' <$> f pos
{-# INLINE withSized #-}

-- | If the parser fails, prepend an error message.
(<?>) :: Parser a -> String -> Parser a
Parser m <?> msg = Parser $ \end p -> withExceptT (\s -> msg ++ ": " ++ s) $ m end p

-- | Advance a pointer. Unlike 'plusPtr', preserves the type of the input.
plusPtr' :: Ptr a -> Int -> Ptr a
plusPtr' = plusPtr
71 changes: 71 additions & 0 deletions proto-lens/tests/parser_test.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
-- | Unit and property tests for of our custom parsing monad.
module Main (main) where

import qualified Data.ByteString as B
import Data.Either (isLeft)

import Test.QuickCheck
import Test.Framework (defaultMain, testGroup, Test)
import Test.Framework.Providers.QuickCheck2 (testProperty)

import Data.ProtoLens.Encoding.Bytes
import Data.ProtoLens.Encoding.Parser

main :: IO ()
main = defaultMain
[ testGroup "getWord8" testGetWord8
, testGroup "getBytes" testGetBytes
, testGroup "getWord32le" testGetWord32le
, testGroup "failure" testFailure
]

testGetWord8 :: [Test]
testGetWord8 =
[ testProperty "atEnd" $ \ws -> runParser atEnd (B.pack ws) === Right (null ws)
, testProperty "manyTillEnd"
$ \ws -> runParser (manyTillEnd getWord8) (B.pack ws) === Right ws
, testProperty "idem" $ \x -> x ==> x
]

testGetBytes :: [Test]
testGetBytes =
[ testProperty "many"
$ \ws -> let
packed = map B.pack ws
in runParser (mapM (getBytes . B.length) packed) (B.concat packed)
=== Right packed
]

testGetWord32le :: [Test]
testGetWord32le =
[ testProperty "align"
$ \ws -> length ws `mod` 4 /= 0 ==>
counterexampleF isLeft (runParser (manyTillEnd getWord32le) (B.pack ws))
, testProperty "manyTillEnd" $ \ws ->
runParser (manyTillEnd getWord32le) (runBuilder $ foldMap putFixed32 ws)
=== Right ws
]

testFailure :: [Test]
testFailure =
[ testProperty "fail-fast" $ \bs ->
runParser (fail "abcde") (B.pack bs)
=== (Left "abcde" :: Either String ())
, testProperty "<?>" $ \bs ->
runParser (fail "abcde" <?> "fghij") (B.pack bs)
=== (Left "fghij: abcde" :: Either String ())
]

-- Since this is a test, just implement the slow stack-heavy way.
manyTillEnd :: Parser a -> Parser [a]
manyTillEnd p = do
end <- atEnd
if end
then return []
else do
x <- p
xs <- manyTillEnd p
return $ x : xs

counterexampleF :: (Testable prop, Show a) => (a -> prop) -> a -> Property
counterexampleF f x = counterexample (show x) $ f x

0 comments on commit ea02c34

Please sign in to comment.