Use a custom parsing monad instead of attoparsec.

All decoding benchmarks show significant speedups after this change. The biggest improvement is to decoding packed data which is 4-5x as fast as before. (See below for a full list of benchmark diffs.) This parsing monad follows the approach of, e.g., the `store` and `persist` packages. It requires that all data be in a *strict* `ByteString`, and uses simple pointer arithmetic internally to walk through its bytes. This effectively works against #62 (streaming parsers) since it needs to read all the input data before starting the parse. However, that issue has already existed since the beginning of this library for, e.g., submessages; see that bug for more details. So this change doesn't appear to be a regression. We also have freedom to later try out different implementations without changing the API, since `Parser` is opaque as of #294. The implementation of Parser differs from `store` and `persist` by using `ExceptT` to pass around errors internally, rather than exceptions (or closures, as in `attoparsec`). We may want to experiment with this later, but in my initial experiments I didn't see a significant improvement from those approaches. Benchmark results (the "time" output from Criterion): flat(602B)/decode/whnf: 13.14 μs (13.02 μs .. 13.29 μs) => 8.686 μs (8.514 μs .. 8.873 μs) nested(900B)/decode/whnf: 26.35 μs (25.85 μs .. 26.86 μs) => 14.01 μs (13.86 μs .. 14.18 μs) int32-packed(1003B)/decode/whnf: 36.23 μs (35.75 μs .. 36.69 μs) => 17.31 μs (17.11 μs .. 17.50 μs) int32-unpacked(2000B)/decode/whnf: 65.18 μs (64.19 μs .. 66.68 μs) => 19.35 μs (19.13 μs .. 19.58 μs) float-packed(4003B)/decode/whnf: 78.61 μs (77.53 μs .. 79.46 μs) => 19.56 μs (19.40 μs .. 19.76 μs) float-unpacked(5000B)/decode/whnf: 108.9 μs (107.8 μs .. 110.3 μs) => 22.29 μs (22.00 μs .. 22.66 μs) no-unused(10003B)/decode/whnf: 571.7 μs (560.0 μs .. 586.6 μs) => 356.5 μs (349.0 μs .. 365.0 μs) with-unused(10003B)/decode/whnf: 786.6 μs (697.8 μs .. 875.5 μs) => 368.3 μs (361.8 μs .. 376.4 μs)
google · Dec 21, 2018 · ea02c34 · ea02c34
1 parent 123e486
commit ea02c34
Show file tree

Hide file tree

Showing 4 changed files with 187 additions and 49 deletions.
diff --git a/proto-lens/package.yaml b/proto-lens/package.yaml
@@ -29,26 +29,10 @@ data-files:
 
 library:
   source-dirs: src
-  exposed-modules:
-    - Data.ProtoLens
-    - Data.ProtoLens.Combinators
-    - Data.ProtoLens.Default
-    - Data.ProtoLens.Encoding
-    - Data.ProtoLens.Encoding.Bytes
-    - Data.ProtoLens.Encoding.Wire
-    - Data.ProtoLens.Message
-    - Data.ProtoLens.Message.Enum
-    - Data.ProtoLens.Service.Types
-    - Data.ProtoLens.TextFormat
-    - Proto.Google.Protobuf.Compiler.Plugin
-    - Proto.Google.Protobuf.Compiler.Plugin_Fields
-    - Proto.Google.Protobuf.Descriptor
-    - Proto.Google.Protobuf.Descriptor_Fields
 
   other-modules:
     - Data.ProtoLens.TextFormat.Parser
   dependencies:
-    - attoparsec == 0.13.*
     - base >= 4.9 && < 4.13
     - bytestring == 0.10.*
     - containers >= 0.5 && < 0.7
@@ -60,3 +44,15 @@ library:
     - text == 1.2.*
     - transformers >= 0.4 && < 0.6
     - void == 0.7.*
+
+tests:
+  parser_test:
+    main: parser_test.hs
+    source-dirs: tests
+    dependencies:
+    - base
+    - bytestring
+    - proto-lens
+    - QuickCheck
+    - test-framework
+    - test-framework-quickcheck2
diff --git a/proto-lens/src/Data/ProtoLens/Encoding/Bytes.hs b/proto-lens/src/Data/ProtoLens/Encoding/Bytes.hs
@@ -42,38 +42,24 @@ module Data.ProtoLens.Encoding.Bytes(
     (<?>),
     ) where
 
-import qualified Data.Attoparsec.ByteString as Atto
 import Data.Bits
 import Data.ByteString (ByteString)
 import Data.ByteString.Lazy.Builder as Builder
 import qualified Data.ByteString.Lazy as L
 import Data.Int (Int32, Int64)
 import Data.Monoid ((<>))
-import Data.Word (Word8, Word32, Word64)
+import Data.Word (Word32, Word64)
 import Foreign.Ptr (castPtr)
 import Foreign.Marshal.Alloc (alloca)
 import Foreign.Storable (Storable, peek, poke)
 import System.IO.Unsafe (unsafePerformIO)
 
--- | A parsing monad for decoding the wire format.
-newtype Parser a = Parser (Atto.Parser a)
-    deriving (Functor, Applicative, Monad)
-
--- | Evaluates a parser on the given input.
---
--- If the parser does not consume all of the input, the rest of the
--- input is discarded and the parser still succeeds.
-runParser :: Parser a -> ByteString -> Either String a
-runParser (Parser p) = Atto.parseOnly p
+import Data.ProtoLens.Encoding.Parser
 
 -- | Constructs a strict 'ByteString' from the given 'Builder'.
 runBuilder :: Builder -> ByteString
 runBuilder = L.toStrict . Builder.toLazyByteString
 
--- | Parse a @ByteString@ of the given length.
-getBytes :: Int -> Parser ByteString
-getBytes = Parser . Atto.take
-
 -- | Emit a given @ByteString@.
 putBytes :: ByteString -> Builder
 putBytes = Builder.byteString
@@ -96,18 +82,8 @@ putVarInt n
     | otherwise = Builder.word8 (fromIntegral $ n .&. 127 .|. 128)
                       <> putVarInt (n `shiftR` 7)
 
-getWord8 :: Parser Word8
-getWord8 = Parser Atto.anyWord8
-
--- | Little-endian decoding function.
 getFixed32 :: Parser Word32
-getFixed32 = do
-    b1 <- getWord8
-    b2 <- getWord8
-    b3 <- getWord8
-    b4 <- getWord8
-    return $ ((fromIntegral b4 `shiftL` 8 + fromIntegral b3)
-                `shiftL` 8 + fromIntegral b2) `shiftL` 8 + fromIntegral b1
+getFixed32 = getWord32le
 
 getFixed64 :: Parser Word64
 getFixed64 = do
@@ -163,9 +139,3 @@ wordToSignedInt64 n
 
 runEither :: Either String a -> Parser a
 runEither = either fail return
-
-atEnd :: Parser Bool
-atEnd = Parser Atto.atEnd
-
-(<?>) :: Parser a -> String -> Parser a
-Parser p <?> msg = Parser (p Atto.<?> msg)
diff --git a/proto-lens/src/Data/ProtoLens/Encoding/Parser.hs b/proto-lens/src/Data/ProtoLens/Encoding/Parser.hs
@@ -0,0 +1,101 @@
+-- | A custom parsing monad, optimized for speed.
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+module Data.ProtoLens.Encoding.Parser
+    ( Parser
+    , runParser
+    , atEnd
+    , getWord8
+    , getWord32le
+    , getBytes
+    , (<?>)
+    ) where
+
+import Data.Bits (shiftL, (.|.))
+import Foreign.Ptr
+import Foreign.Storable
+import Control.Monad.Trans.Except
+import Data.Word (Word8, Word32)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Unsafe as B
+import Data.ByteString (ByteString)
+import Control.Monad (ap)
+import Control.Monad.IO.Class
+import System.IO.Unsafe
+
+-- | A monad for parsing an input buffer.
+newtype Parser a = Parser
+    { unParser :: Ptr Word8 -- ^ End position of the input
+               -> Ptr Word8 -- ^ Current position in the input
+               -> ExceptT String IO (ParserResult a) }
+
+data ParserResult a = ParserResult
+    { _newPos :: !(Ptr Word8) -- ^ New position in the input
+    , unParserResult :: a
+    }
+
+instance Functor ParserResult where
+    fmap f (ParserResult p x) = ParserResult p (f x)
+
+instance Functor Parser where
+    fmap f (Parser g) = Parser $ \end cur -> fmap (fmap f) $ g end cur
+
+instance Applicative Parser where
+    pure x = Parser $ \_ cur -> return $ ParserResult cur x
+    (<*>) = ap
+
+instance Monad Parser where
+    fail s = Parser $ \_ _ -> throwE s
+    return = pure
+    Parser f >>= g = Parser $ \end pos -> do
+        ParserResult pos' x <- f end pos
+        unParser (g x) end pos'
+
+-- | Evaluates a parser on the given input.
+--
+-- If the parser does not consume all of the input, the rest of the
+-- input is discarded and the parser still succeeds.
+runParser :: Parser a -> ByteString -> Either String a
+runParser (Parser m) b = unsafePerformIO $ B.unsafeUseAsCStringLen b $ \(p, len)
+    -> runExceptT $ fmap unParserResult $ m (p `plusPtr` len) (castPtr p)
+
+atEnd :: Parser Bool
+atEnd = Parser $ \end pos -> return $ ParserResult pos (pos == end)
+
+-- | Parse a one-byte word.
+getWord8 :: Parser Word8
+getWord8 = withSized 1 "getWord8: Unexpected end of input" peek
+
+-- | Parser a 4-byte word in little-endian order.
+getWord32le :: Parser Word32
+getWord32le = withSized 4 "getWord32le: Unexpected end of input" $ \pos -> do
+    b1 <- peek pos
+    b2 <- peek $ pos `plusPtr'` 1
+    b3 <- peek $ pos `plusPtr'` 2
+    b4 <- peek $ pos `plusPtr'` 3
+    return $ ((fromIntegral b4 `shiftL` 8 .|. fromIntegral b3)
+                `shiftL` 8 .|. fromIntegral b2) `shiftL` 8 .|. fromIntegral b1
+
+-- | Parse a 'B.ByteString' of the given length.
+getBytes :: Int -> Parser B.ByteString
+getBytes n = withSized n "getBytes: Unexpected end of input" $ \pos ->
+    B.packCStringLen (castPtr pos, n)
+
+-- | Helper function for reading bytes from the current position and
+-- advancing the pointer.
+withSized :: Int -> String -> (Ptr Word8 -> IO a) -> Parser a
+withSized len message f = Parser $ \end pos ->
+    let pos' = pos `plusPtr'` len
+    in if pos' > end
+        then throwE $ message
+        else liftIO $ ParserResult pos' <$> f pos
+{-# INLINE withSized #-}
+
+-- | If the parser fails, prepend an error message.
+(<?>) :: Parser a -> String -> Parser a
+Parser m <?> msg = Parser $ \end p -> withExceptT (\s -> msg ++ ": " ++ s) $ m end p
+
+-- | Advance a pointer.  Unlike 'plusPtr', preserves the type of the input.
+plusPtr' :: Ptr a -> Int -> Ptr a
+plusPtr' = plusPtr
diff --git a/proto-lens/tests/parser_test.hs b/proto-lens/tests/parser_test.hs
@@ -0,0 +1,71 @@
+-- | Unit and property tests for of our custom parsing monad.
+module Main (main) where
+
+import qualified Data.ByteString as B
+import Data.Either (isLeft)
+
+import Test.QuickCheck
+import Test.Framework (defaultMain, testGroup, Test)
+import Test.Framework.Providers.QuickCheck2 (testProperty)
+
+import Data.ProtoLens.Encoding.Bytes
+import Data.ProtoLens.Encoding.Parser
+
+main :: IO ()
+main = defaultMain
+    [ testGroup "getWord8" testGetWord8
+    , testGroup "getBytes" testGetBytes
+    , testGroup "getWord32le" testGetWord32le
+    , testGroup "failure" testFailure
+    ]
+
+testGetWord8 :: [Test]
+testGetWord8 =
+    [ testProperty "atEnd" $ \ws -> runParser atEnd (B.pack ws) === Right (null ws)
+    , testProperty "manyTillEnd"
+            $ \ws -> runParser (manyTillEnd getWord8) (B.pack ws) === Right ws
+    , testProperty "idem" $ \x -> x ==> x
+    ]
+
+testGetBytes :: [Test]
+testGetBytes =
+    [ testProperty "many"
+            $ \ws -> let
+                packed = map B.pack ws
+                in runParser (mapM (getBytes . B.length) packed) (B.concat packed)
+                    === Right packed
+    ]
+
+testGetWord32le :: [Test]
+testGetWord32le =
+    [ testProperty "align"
+        $ \ws -> length ws `mod` 4 /= 0 ==>
+                    counterexampleF isLeft (runParser (manyTillEnd getWord32le) (B.pack ws))
+    , testProperty "manyTillEnd" $ \ws ->
+            runParser (manyTillEnd getWord32le) (runBuilder $ foldMap putFixed32 ws)
+                === Right ws
+    ]
+
+testFailure :: [Test]
+testFailure =
+    [ testProperty "fail-fast" $ \bs ->
+        runParser (fail "abcde") (B.pack bs)
+            === (Left "abcde" :: Either String ())
+    , testProperty "<?>" $ \bs ->
+        runParser (fail "abcde" <?> "fghij") (B.pack bs)
+            === (Left "fghij: abcde" :: Either String ())
+    ]
+
+-- Since this is a test, just implement the slow stack-heavy way.
+manyTillEnd :: Parser a -> Parser [a]
+manyTillEnd p = do
+    end <- atEnd
+    if end
+        then return []
+        else do
+            x <- p
+            xs <- manyTillEnd p
+            return $ x : xs
+
+counterexampleF :: (Testable prop, Show a) => (a -> prop) -> a -> Property
+counterexampleF f x = counterexample (show x) $ f x