sparsec

simple Haskell parser combinator library
git clone git://git.rr3.xyz/sparsec
Log | Files | Refs | README | LICENSE

Sparsec.hs (14187B)


      1 module Sparsec (
      2     module Sparsec.Char,
      3     module Sparsec.Loc,
      4     module Sparsec.Span,
      5     -- State
      6     State#,
      7     pattern State#,
      8     stateInput#,
      9     stateLoc#,
     10     stateEq#,
     11     State (..),
     12     pattern State,
     13     -- Result
     14     Result#,
     15     pattern Utf8Error#,
     16     pattern Failure#,
     17     pattern Error#,
     18     pattern Success#,
     19     Result (..),
     20     -- Parse monad
     21     ParseT (..),
     22     runParseT#,
     23     runParseT,
     24     mapParseT,
     25     Parse,
     26     runParse#,
     27     runParse,
     28     mapParse,
     29     -- Core combinators
     30     fail,
     31     err,
     32     succeed,
     33     continue,
     34     get#,
     35     put#,
     36     -- General combinators
     37     not,
     38     try,
     39     cut,
     40     catch,
     41     branch,
     42     lookahead,
     43     withSpan,
     44     withConsumed,
     45     iter,
     46     chainl,
     47     chainr,
     48     many,
     49     some,
     50     many_,
     51     some_,
     52     manySepBy,
     53     someSepBy,
     54     choose,
     55     match,
     56     -- Utf8 combinators
     57     next,
     58     eof,
     59     anyChar,
     60     charIf,
     61     char,
     62     string,
     63     natural,
     64 ) where
     65 
     66 import Control.Applicative
     67 import Control.Monad hiding (fail)
     68 import Data.ByteString (ByteString)
     69 import Data.ByteString qualified as BS
     70 import Data.ByteString.UTF8 qualified as UTF8
     71 import Data.Char
     72 import Data.Foldable
     73 import Data.Functor
     74 import GHC.Exts (Int (..), (-#))
     75 import Numeric.Natural
     76 import Prelude hiding (fail, not, read)
     77 
     78 import Sparsec.Char
     79 import Sparsec.Loc
     80 import Sparsec.Span
     81 
     82 --------------------------------------------------------------------------------
     83 -- State#
     84 
     85 newtype State# = State_# (# ByteString, Loc# #)
     86 
     87 pattern State# :: ByteString -> Loc# -> State#
     88 pattern State# i l = State_# (# i, l #)
     89 {-# COMPLETE State# #-}
     90 
     91 stateInput# :: State# -> ByteString
     92 stateInput# (State# i _) = i
     93 {-# INLINE stateInput# #-}
     94 
     95 stateLoc# :: State# -> Loc#
     96 stateLoc# (State# _ l) = l
     97 {-# INLINE stateLoc# #-}
     98 
     99 stateEq# :: State# -> State# -> Bool
    100 stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && l0 `locEq#` l1
    101 {-# INLINE stateEq# #-}
    102 
    103 data State = StateBox {stateUnbox :: State#}
    104 
    105 pattern State :: ByteString -> Loc# -> State
    106 pattern State i l = StateBox (State# i l)
    107 {-# COMPLETE State #-}
    108 
    109 instance Eq State where
    110     StateBox s0 == StateBox s1 = s0 `stateEq#` s1
    111     {-# INLINE (==) #-}
    112 
    113 --------------------------------------------------------------------------------
    114 -- Result#
    115 
    116 newtype Result# e a = Result# (# Loc# | (# #) | e | (# a, State# #) #)
    117 
    118 pattern Utf8Error# :: Loc# -> Result# e a
    119 pattern Utf8Error# l = Result# (# l | | | #)
    120 
    121 pattern Failure# :: Result# e a
    122 pattern Failure# = Result# (# | (# #) | | #)
    123 
    124 pattern Error# :: e -> Result# e a
    125 pattern Error# e = Result# (# | | e | #)
    126 
    127 pattern Success# :: a -> State# -> Result# e a
    128 pattern Success# a s = Result# (# | | | (# a, s #) #)
    129 
    130 {-# COMPLETE Utf8Error#, Failure#, Error#, Success# #-}
    131 
    132 data Result e a = ResultBox { resultUnbox :: Result# e a }
    133 
    134 -- instance (Eq e, Eq a) => Eq (Result# e a) where
    135 --     Utf8Error# l0 == Utf8Error# l1 = l0 `locEq#` l1
    136 --     Failure# == Failure# = True
    137 --     Error# e0 == Error# e1 = e0 == e1
    138 --     Success# a0 s0 == Success# a1 s1 = a0 == a1 && s0 `stateEq#` s1
    139 --     _ == _ = False
    140 --     {-# INLINE (==) #-}
    141 
    142 -- instance (Show e, Show a) => Show (Result# e a) where
    143 --     show = \case
    144 --         Utf8Error# l -> "utf8 error at " ++ locShow# l
    145 --         Failure# -> "failure"
    146 --         Error# e -> "error: " ++ show e
    147 --         Success# a (State# i _) ->
    148 --             if BS.null i
    149 --                 then printf "success: %s" (show a)
    150 --                 else printf "success (%d bytes remaining): %s" (BS.length i) (show a)
    151 
    152 -- instance Functor (Result# e) where
    153 --     fmap f = \case
    154 --         Utf8Error# l -> Utf8Error# l
    155 --         Failure# -> Failure#
    156 --         Error# e -> Error# e
    157 --         Success# a s -> Success# (f a) s
    158 --     {-# INLINE fmap #-}
    159 
    160 --------------------------------------------------------------------------------
    161 -- Result
    162 
    163 -- TODO?
    164 
    165 --------------------------------------------------------------------------------
    166 -- ParseT
    167 
    168 -- | ParseT is a monad transformer for parsing. It effectively has a ByteString
    169 -- and Loc state effect for the input, and an error effect (of some type e) for
    170 -- parsing errors. Parsing *errors* are distinct from parsing *failures* in
    171 -- that only the latter trigger backtracking in the Alternative instance.
    172 newtype ParseT e a = ParseT (State# -> Result# e a)
    173 
    174 runParseT# :: ParseT e a -> State# -> Result# e a
    175 runParseT# (ParseT f) = f
    176 {-# INLINE runParseT# #-}
    177 
    178 runParseT :: ParseT e a -> ByteString -> Loc -> Result e a
    179 runParseT p i l = ResultBox $ p `runParseT#` State# i (locUnbox l)
    180 {-# INLINE runParseT #-}
    181 
    182 mapParseT ::
    183     (Result# e a -> Result# e' a') ->
    184     ParseT e a ->
    185     ParseT e' a'
    186 mapParseT f p = ParseT \s -> f $ p `runParseT#` s
    187 {-# INLINE mapParseT #-}
    188 
    189 --------------------------------------------------------------------------------
    190 -- Parse
    191 
    192 type Parse e a = ParseT e a
    193 
    194 runParse# :: Parse e a -> State# -> Result# e a
    195 runParse# = runParseT#
    196 {-# INLINE runParse# #-}
    197 
    198 runParse :: Parse e a -> ByteString -> Loc -> Result e a
    199 runParse = runParseT
    200 {-# INLINE runParse #-}
    201 
    202 mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a'
    203 mapParse = mapParseT
    204 {-# INLINE mapParse #-}
    205 
    206 --------------------------------------------------------------------------------
    207 -- Core combinators
    208 
    209 fail :: ParseT e a
    210 fail = ParseT \_s -> Failure#
    211 {-# INLINE fail #-}
    212 
    213 err :: e -> ParseT e a
    214 err e = ParseT \_s -> Error# e
    215 {-# INLINE err #-}
    216 
    217 succeed :: a -> ParseT e a
    218 succeed a = ParseT \s -> Success# a s
    219 {-# INLINE succeed #-}
    220 
    221 continue :: ParseT f b -> (e -> ParseT f b) -> (a -> ParseT f b) -> ParseT e a -> ParseT f b
    222 continue kf ke ks p = ParseT \s ->
    223     case p `runParseT#` s of
    224         Utf8Error# l -> Utf8Error# l
    225         Failure# -> kf `runParseT#` s
    226         Error# e -> ke e `runParseT#` s
    227         Success# a s' -> ks a `runParseT#` s'
    228 {-# INLINE continue #-}
    229 
    230 get# :: (State# -> ParseT e a) -> ParseT e a
    231 get# f = ParseT \s -> f s `runParseT#` s
    232 {-# INLINE get# #-}
    233 
    234 put# :: State# -> ParseT e a -> ParseT e a
    235 put# s p = ParseT \_ -> p `runParseT#` s
    236 {-# INLINE put# #-}
    237 
    238 -- | Read the entire input without consuming it.
    239 -- getInput :: (Monad m) => ParseT e m ByteString
    240 -- getInput = ParseT \s -> pure $ Success# (stateInput# s) s
    241 
    242 -- | Replace the entire input without affecting the current location.
    243 -- putInput :: (Monad m) => ByteString -> ParseT e m ()
    244 -- putInput i = ParseT \(State# _ l) -> pure $ Success# () (State# i l)
    245 
    246 -- modifyInput :: (Monad m) => (ByteString -> ByteString) -> Parse e m ()
    247 -- modifyInput f = ParseT \(State# i l) -> pure $ Success# () (State (f i) l)
    248 
    249 -- getLoc :: (Monad m) => ParseT e m Loc
    250 -- getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s
    251 
    252 -- putLoc :: (Monad m) => Loc -> ParseT e m ()
    253 -- putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l))
    254 
    255 -- modifyLoc# :: (Monad m) => (Loc# -> Loc#) -> ParseT e m ()
    256 -- modifyLoc# f = ParseT \(State# i l) -> pure $ Success# () (State# i (f l))
    257 
    258 -- modifyLoc :: (Monad m) => (Loc -> Loc) -> ParseT e m ()
    259 -- modifyLoc f = ParseT \(State# i l) -> pure $ Success# () (State# i (unboxLoc . f . boxLoc $ l))
    260 
    261 -- save :: (Monad m) => ParseT e m State
    262 -- save = ParseT \s -> pure $ Success# (stateBox s) s
    263 
    264 -- load :: (Monad m) => State -> ParseT e m ()
    265 -- load s = ParseT \_ -> pure $ Success# () (stateUnbox s)
    266 
    267 --------------------------------------------------------------------------------
    268 -- Instances
    269 
    270 instance Functor (ParseT e) where
    271     fmap f p = continue fail err (succeed . f) p
    272     {-# INLINE fmap #-}
    273     a <$ p = continue fail err (const $ succeed a) p
    274     {-# INLINE (<$) #-}
    275 
    276 instance Applicative (ParseT e) where
    277     pure = succeed
    278     {-# INLINE pure #-}
    279     p <*> q = continue fail err (<$> q) p
    280     {-# INLINE (<*>) #-}
    281     p *> q = continue fail err (const q) p
    282     {-# INLINE (*>) #-}
    283     p <* q = continue fail err (<$ q) p
    284     {-# INLINE (<*) #-}
    285 
    286 instance Monad (ParseT e) where
    287     return = pure
    288     {-# INLINE return #-}
    289     p >>= k = continue fail err k p
    290     {-# INLINE (>>=) #-}
    291     (>>) = (*>)
    292     {-# INLINE (>>) #-}
    293 
    294 instance Alternative (ParseT e) where
    295     empty = fail
    296     {-# INLINE empty #-}
    297     p <|> q = continue q err succeed p
    298     {-# INLINE (<|>) #-}
    299     many p = ParseT go
    300       where
    301         go s = case p `runParseT#` s of
    302             Utf8Error# l -> Utf8Error# l
    303             Failure# -> Success# [] s
    304             Error# e -> Error# e
    305             Success# a s' -> case go s' of
    306                 Success# as s'' -> Success# (a:as) s''
    307                 x -> x
    308     {-# INLINE many #-}
    309     some p = (:) <$> p <*> many p
    310     {-# INLINE some #-}
    311 
    312 --------------------------------------------------------------------------------
    313 -- General combinators
    314 
    315 -- TODO: Boxed variants of functions
    316 -- TODO: Non-CPS variants of functions
    317 
    318 -- | Turn a failure into a success and vice versa.
    319 not :: ParseT e a -> ParseT e ()
    320 not = continue (succeed ()) err (const fail)
    321 {-# INLINE not #-}
    322 
    323 -- | Convert an error into a failure.
    324 try :: ParseT e a -> ParseT e a
    325 try = continue fail (const fail) succeed
    326 {-# INLINE try #-}
    327 
    328 -- | Convert a failure into an error.
    329 cut :: ParseT e a -> e -> ParseT e a
    330 p `cut` e = continue (err e) err succeed p
    331 {-# INLINE cut #-}
    332 
    333 -- | Catch an error.
    334 catch :: ParseT e a -> (e -> ParseT f a) -> ParseT f a
    335 p `catch` h = continue fail h succeed p
    336 {-# INLINE catch #-}
    337 
    338 -- | `branch p ks kf` runs p; if it succeeds, it continues with ks; if it fails,
    339 -- it continues with kf.
    340 branch :: ParseT e a -> (a -> ParseT e b) -> ParseT e b -> ParseT e b
    341 branch p ks kf = continue kf err ks p
    342 {-# INLINE branch #-}
    343 
    344 -- | Run a parser without changing the parser state.
    345 lookahead :: ParseT e a -> ParseT e a
    346 lookahead p = get# \s -> p <* put# s (pure ())
    347 {-# INLINE lookahead #-}
    348 
    349 withSpan :: (a -> Span# -> ParseT e b) -> ParseT e a -> ParseT e b
    350 withSpan k p =
    351     get# \(State# _ beg) ->
    352         p >>= \a ->
    353             get# \(State# _ end) ->
    354                 k a (Span# beg end)
    355 {-# INLINE withSpan #-}
    356 
    357 withConsumed :: (a -> ByteString -> ParseT e b) -> ParseT e a -> ParseT e b
    358 withConsumed k p =
    359     get# \(State# i (Loc# b0 _ _)) ->
    360         p >>= \a ->
    361             get# \(State# _ (Loc# b1 _ _)) ->
    362                 k a (BS.take (I# (b1 -# b0)) i)
    363 {-# INLINE withConsumed #-}
    364 
    365 -- | Iterate a parsing function until it fails. I.e.,
    366 -- @iter f a = pure a >>= f >>= f >>= ...@
    367 -- where the chain of binds is as long as possible without failure.
    368 iter :: (a -> ParseT e a) -> a -> ParseT e a
    369 iter f a = (f a >>= iter f) <|> pure a
    370 {-# INLINE iter #-}
    371 
    372 -- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested
    373 -- fashion.
    374 chainl :: (b -> a -> b) -> ParseT e b -> ParseT e a -> ParseT e b
    375 chainl f pb pa = pb >>= iter \b -> f b <$> pa
    376 {-# INLINE chainl #-}
    377 
    378 -- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a
    379 -- right-nested fashion.
    380 chainr :: (a -> b -> b) -> ParseT e a -> ParseT e b -> ParseT e b
    381 chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb
    382 {-# INLINE chainr #-}
    383 
    384 many_ :: ParseT e a -> ParseT e ()
    385 -- many_ p = some_ p <|> pure ()
    386 many_ p = ParseT go
    387   where
    388     go s = case p `runParseT#` s of
    389         Utf8Error# l -> Utf8Error# l
    390         Failure# -> Success# () s
    391         Error# e -> Error# e
    392         Success# _ s' -> go s'
    393 {-# INLINE many_ #-}
    394 
    395 some_ :: ParseT e a -> ParseT e ()
    396 some_ p = p *> many_ p
    397 {-# INLINE some_ #-}
    398 
    399 manySepBy :: ParseT e a -> ParseT e b -> ParseT e [a]
    400 manySepBy p sep = someSepBy p sep <|> pure []
    401 {-# INLINE manySepBy #-}
    402 
    403 someSepBy :: ParseT e a -> ParseT e b -> ParseT e [a]
    404 someSepBy p sep = (:) <$> p <*> many (sep *> p)
    405 {-# INLINE someSepBy #-}
    406 
    407 choose :: (Foldable t) => t (ParseT e a) -> ParseT e a
    408 choose = asum
    409 {-# INLINE choose #-}
    410 
    411 -- | `match scrut cases` binds scrut to the first non-failing case and returns
    412 -- the result.
    413 match ::
    414     (Functor t, Foldable t) =>
    415     ParseT e a ->
    416     t (a -> ParseT e b) ->
    417     ParseT e b
    418 match scrut cases = choose $ (scrut >>=) <$> cases
    419 {-# INLINE match #-}
    420 
    421 --------------------------------------------------------------------------------
    422 -- Utf8 combinators
    423 
    424 -- TODO: Try writing custom utf8 decoder that doesn't allocate Maybes.
    425 -- TODO: Non-CPS version of next?
    426 
    427 next :: ParseT e a -> (Char -> ParseT e a) -> ParseT e a
    428 next ke kc = ParseT \s@(State# i l) ->
    429     case UTF8.decode i of
    430         Just ('\xFFFD', _) -> Utf8Error# l
    431         Just (c, w) ->
    432             -- TODO: This strictness annotation apparently does nothing.
    433             let !i' = BS.drop w i
    434             in kc c `runParseT#` State# i' (locAdvance# c w l)
    435         Nothing -> ke `runParseT#` s
    436 {-# INLINE next #-}
    437 
    438 eof :: ParseT e ()
    439 eof = get# \(State# i _) -> guard (BS.null i)
    440 {-# INLINE eof #-}
    441 
    442 anyChar :: ParseT e Char
    443 anyChar = next fail succeed
    444 {-# INLINE anyChar #-}
    445 
    446 charIf :: (Char -> Bool) -> ParseT e Char
    447 charIf want = next fail (\c -> guard (want c) $> c)
    448 {-# INLINE charIf #-}
    449 
    450 char :: Char -> ParseT e ()
    451 char c = void $ charIf (== c)
    452 {-# INLINE char #-}
    453 
    454 string :: String -> ParseT e ()
    455 string = traverse_ char
    456 {-# INLINE string #-}
    457 
    458 natural :: Int -> ParseT e Natural
    459 natural = \case
    460     0 ->
    461         let prefix l u = char '0' *> (char l <|> char u)
    462         in prefix 'b' 'B' *> natural 2
    463             <|> prefix 'q' 'Q' *> natural 4
    464             <|> prefix 'o' 'O' *> natural 8
    465             <|> prefix 'x' 'X' *> natural 16
    466             <|> natural 10
    467     b
    468         | 0 < b && b <= 36 ->
    469             let nat :: Int -> Natural
    470                 nat = fromIntegral
    471 
    472                 maxn = min (chr $ ord '0' + b) (succ '9')
    473                 maxu = min (chr $ ord 'A' + b - 10) (succ 'Z')
    474                 maxl = min (chr $ ord 'a' + b - 10) (succ 'z')
    475 
    476                 numeral =
    477                     anyChar >>= \c ->
    478                         if
    479                             | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0')
    480                             | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10)
    481                             | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10)
    482                             | otherwise -> fail
    483 
    484                 underscores = many (char '_')
    485             in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral))
    486     _ -> error "natural: invalid base"
    487 {-# INLINE natural #-}