Sparsec.hs (14187B)
1 module Sparsec ( 2 module Sparsec.Char, 3 module Sparsec.Loc, 4 module Sparsec.Span, 5 -- State 6 State#, 7 pattern State#, 8 stateInput#, 9 stateLoc#, 10 stateEq#, 11 State (..), 12 pattern State, 13 -- Result 14 Result#, 15 pattern Utf8Error#, 16 pattern Failure#, 17 pattern Error#, 18 pattern Success#, 19 Result (..), 20 -- Parse monad 21 ParseT (..), 22 runParseT#, 23 runParseT, 24 mapParseT, 25 Parse, 26 runParse#, 27 runParse, 28 mapParse, 29 -- Core combinators 30 fail, 31 err, 32 succeed, 33 continue, 34 get#, 35 put#, 36 -- General combinators 37 not, 38 try, 39 cut, 40 catch, 41 branch, 42 lookahead, 43 withSpan, 44 withConsumed, 45 iter, 46 chainl, 47 chainr, 48 many, 49 some, 50 many_, 51 some_, 52 manySepBy, 53 someSepBy, 54 choose, 55 match, 56 -- Utf8 combinators 57 next, 58 eof, 59 anyChar, 60 charIf, 61 char, 62 string, 63 natural, 64 ) where 65 66 import Control.Applicative 67 import Control.Monad hiding (fail) 68 import Data.ByteString (ByteString) 69 import Data.ByteString qualified as BS 70 import Data.ByteString.UTF8 qualified as UTF8 71 import Data.Char 72 import Data.Foldable 73 import Data.Functor 74 import GHC.Exts (Int (..), (-#)) 75 import Numeric.Natural 76 import Prelude hiding (fail, not, read) 77 78 import Sparsec.Char 79 import Sparsec.Loc 80 import Sparsec.Span 81 82 -------------------------------------------------------------------------------- 83 -- State# 84 85 newtype State# = State_# (# ByteString, Loc# #) 86 87 pattern State# :: ByteString -> Loc# -> State# 88 pattern State# i l = State_# (# i, l #) 89 {-# COMPLETE State# #-} 90 91 stateInput# :: State# -> ByteString 92 stateInput# (State# i _) = i 93 {-# INLINE stateInput# #-} 94 95 stateLoc# :: State# -> Loc# 96 stateLoc# (State# _ l) = l 97 {-# INLINE stateLoc# #-} 98 99 stateEq# :: State# -> State# -> Bool 100 stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && l0 `locEq#` l1 101 {-# INLINE stateEq# #-} 102 103 data State = StateBox {stateUnbox :: State#} 104 105 pattern State :: ByteString -> Loc# -> State 106 pattern State i l = StateBox (State# i l) 107 {-# COMPLETE State #-} 108 109 instance Eq State where 110 StateBox s0 == StateBox s1 = s0 `stateEq#` s1 111 {-# INLINE (==) #-} 112 113 -------------------------------------------------------------------------------- 114 -- Result# 115 116 newtype Result# e a = Result# (# Loc# | (# #) | e | (# a, State# #) #) 117 118 pattern Utf8Error# :: Loc# -> Result# e a 119 pattern Utf8Error# l = Result# (# l | | | #) 120 121 pattern Failure# :: Result# e a 122 pattern Failure# = Result# (# | (# #) | | #) 123 124 pattern Error# :: e -> Result# e a 125 pattern Error# e = Result# (# | | e | #) 126 127 pattern Success# :: a -> State# -> Result# e a 128 pattern Success# a s = Result# (# | | | (# a, s #) #) 129 130 {-# COMPLETE Utf8Error#, Failure#, Error#, Success# #-} 131 132 data Result e a = ResultBox { resultUnbox :: Result# e a } 133 134 -- instance (Eq e, Eq a) => Eq (Result# e a) where 135 -- Utf8Error# l0 == Utf8Error# l1 = l0 `locEq#` l1 136 -- Failure# == Failure# = True 137 -- Error# e0 == Error# e1 = e0 == e1 138 -- Success# a0 s0 == Success# a1 s1 = a0 == a1 && s0 `stateEq#` s1 139 -- _ == _ = False 140 -- {-# INLINE (==) #-} 141 142 -- instance (Show e, Show a) => Show (Result# e a) where 143 -- show = \case 144 -- Utf8Error# l -> "utf8 error at " ++ locShow# l 145 -- Failure# -> "failure" 146 -- Error# e -> "error: " ++ show e 147 -- Success# a (State# i _) -> 148 -- if BS.null i 149 -- then printf "success: %s" (show a) 150 -- else printf "success (%d bytes remaining): %s" (BS.length i) (show a) 151 152 -- instance Functor (Result# e) where 153 -- fmap f = \case 154 -- Utf8Error# l -> Utf8Error# l 155 -- Failure# -> Failure# 156 -- Error# e -> Error# e 157 -- Success# a s -> Success# (f a) s 158 -- {-# INLINE fmap #-} 159 160 -------------------------------------------------------------------------------- 161 -- Result 162 163 -- TODO? 164 165 -------------------------------------------------------------------------------- 166 -- ParseT 167 168 -- | ParseT is a monad transformer for parsing. It effectively has a ByteString 169 -- and Loc state effect for the input, and an error effect (of some type e) for 170 -- parsing errors. Parsing *errors* are distinct from parsing *failures* in 171 -- that only the latter trigger backtracking in the Alternative instance. 172 newtype ParseT e a = ParseT (State# -> Result# e a) 173 174 runParseT# :: ParseT e a -> State# -> Result# e a 175 runParseT# (ParseT f) = f 176 {-# INLINE runParseT# #-} 177 178 runParseT :: ParseT e a -> ByteString -> Loc -> Result e a 179 runParseT p i l = ResultBox $ p `runParseT#` State# i (locUnbox l) 180 {-# INLINE runParseT #-} 181 182 mapParseT :: 183 (Result# e a -> Result# e' a') -> 184 ParseT e a -> 185 ParseT e' a' 186 mapParseT f p = ParseT \s -> f $ p `runParseT#` s 187 {-# INLINE mapParseT #-} 188 189 -------------------------------------------------------------------------------- 190 -- Parse 191 192 type Parse e a = ParseT e a 193 194 runParse# :: Parse e a -> State# -> Result# e a 195 runParse# = runParseT# 196 {-# INLINE runParse# #-} 197 198 runParse :: Parse e a -> ByteString -> Loc -> Result e a 199 runParse = runParseT 200 {-# INLINE runParse #-} 201 202 mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a' 203 mapParse = mapParseT 204 {-# INLINE mapParse #-} 205 206 -------------------------------------------------------------------------------- 207 -- Core combinators 208 209 fail :: ParseT e a 210 fail = ParseT \_s -> Failure# 211 {-# INLINE fail #-} 212 213 err :: e -> ParseT e a 214 err e = ParseT \_s -> Error# e 215 {-# INLINE err #-} 216 217 succeed :: a -> ParseT e a 218 succeed a = ParseT \s -> Success# a s 219 {-# INLINE succeed #-} 220 221 continue :: ParseT f b -> (e -> ParseT f b) -> (a -> ParseT f b) -> ParseT e a -> ParseT f b 222 continue kf ke ks p = ParseT \s -> 223 case p `runParseT#` s of 224 Utf8Error# l -> Utf8Error# l 225 Failure# -> kf `runParseT#` s 226 Error# e -> ke e `runParseT#` s 227 Success# a s' -> ks a `runParseT#` s' 228 {-# INLINE continue #-} 229 230 get# :: (State# -> ParseT e a) -> ParseT e a 231 get# f = ParseT \s -> f s `runParseT#` s 232 {-# INLINE get# #-} 233 234 put# :: State# -> ParseT e a -> ParseT e a 235 put# s p = ParseT \_ -> p `runParseT#` s 236 {-# INLINE put# #-} 237 238 -- | Read the entire input without consuming it. 239 -- getInput :: (Monad m) => ParseT e m ByteString 240 -- getInput = ParseT \s -> pure $ Success# (stateInput# s) s 241 242 -- | Replace the entire input without affecting the current location. 243 -- putInput :: (Monad m) => ByteString -> ParseT e m () 244 -- putInput i = ParseT \(State# _ l) -> pure $ Success# () (State# i l) 245 246 -- modifyInput :: (Monad m) => (ByteString -> ByteString) -> Parse e m () 247 -- modifyInput f = ParseT \(State# i l) -> pure $ Success# () (State (f i) l) 248 249 -- getLoc :: (Monad m) => ParseT e m Loc 250 -- getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s 251 252 -- putLoc :: (Monad m) => Loc -> ParseT e m () 253 -- putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l)) 254 255 -- modifyLoc# :: (Monad m) => (Loc# -> Loc#) -> ParseT e m () 256 -- modifyLoc# f = ParseT \(State# i l) -> pure $ Success# () (State# i (f l)) 257 258 -- modifyLoc :: (Monad m) => (Loc -> Loc) -> ParseT e m () 259 -- modifyLoc f = ParseT \(State# i l) -> pure $ Success# () (State# i (unboxLoc . f . boxLoc $ l)) 260 261 -- save :: (Monad m) => ParseT e m State 262 -- save = ParseT \s -> pure $ Success# (stateBox s) s 263 264 -- load :: (Monad m) => State -> ParseT e m () 265 -- load s = ParseT \_ -> pure $ Success# () (stateUnbox s) 266 267 -------------------------------------------------------------------------------- 268 -- Instances 269 270 instance Functor (ParseT e) where 271 fmap f p = continue fail err (succeed . f) p 272 {-# INLINE fmap #-} 273 a <$ p = continue fail err (const $ succeed a) p 274 {-# INLINE (<$) #-} 275 276 instance Applicative (ParseT e) where 277 pure = succeed 278 {-# INLINE pure #-} 279 p <*> q = continue fail err (<$> q) p 280 {-# INLINE (<*>) #-} 281 p *> q = continue fail err (const q) p 282 {-# INLINE (*>) #-} 283 p <* q = continue fail err (<$ q) p 284 {-# INLINE (<*) #-} 285 286 instance Monad (ParseT e) where 287 return = pure 288 {-# INLINE return #-} 289 p >>= k = continue fail err k p 290 {-# INLINE (>>=) #-} 291 (>>) = (*>) 292 {-# INLINE (>>) #-} 293 294 instance Alternative (ParseT e) where 295 empty = fail 296 {-# INLINE empty #-} 297 p <|> q = continue q err succeed p 298 {-# INLINE (<|>) #-} 299 many p = ParseT go 300 where 301 go s = case p `runParseT#` s of 302 Utf8Error# l -> Utf8Error# l 303 Failure# -> Success# [] s 304 Error# e -> Error# e 305 Success# a s' -> case go s' of 306 Success# as s'' -> Success# (a:as) s'' 307 x -> x 308 {-# INLINE many #-} 309 some p = (:) <$> p <*> many p 310 {-# INLINE some #-} 311 312 -------------------------------------------------------------------------------- 313 -- General combinators 314 315 -- TODO: Boxed variants of functions 316 -- TODO: Non-CPS variants of functions 317 318 -- | Turn a failure into a success and vice versa. 319 not :: ParseT e a -> ParseT e () 320 not = continue (succeed ()) err (const fail) 321 {-# INLINE not #-} 322 323 -- | Convert an error into a failure. 324 try :: ParseT e a -> ParseT e a 325 try = continue fail (const fail) succeed 326 {-# INLINE try #-} 327 328 -- | Convert a failure into an error. 329 cut :: ParseT e a -> e -> ParseT e a 330 p `cut` e = continue (err e) err succeed p 331 {-# INLINE cut #-} 332 333 -- | Catch an error. 334 catch :: ParseT e a -> (e -> ParseT f a) -> ParseT f a 335 p `catch` h = continue fail h succeed p 336 {-# INLINE catch #-} 337 338 -- | `branch p ks kf` runs p; if it succeeds, it continues with ks; if it fails, 339 -- it continues with kf. 340 branch :: ParseT e a -> (a -> ParseT e b) -> ParseT e b -> ParseT e b 341 branch p ks kf = continue kf err ks p 342 {-# INLINE branch #-} 343 344 -- | Run a parser without changing the parser state. 345 lookahead :: ParseT e a -> ParseT e a 346 lookahead p = get# \s -> p <* put# s (pure ()) 347 {-# INLINE lookahead #-} 348 349 withSpan :: (a -> Span# -> ParseT e b) -> ParseT e a -> ParseT e b 350 withSpan k p = 351 get# \(State# _ beg) -> 352 p >>= \a -> 353 get# \(State# _ end) -> 354 k a (Span# beg end) 355 {-# INLINE withSpan #-} 356 357 withConsumed :: (a -> ByteString -> ParseT e b) -> ParseT e a -> ParseT e b 358 withConsumed k p = 359 get# \(State# i (Loc# b0 _ _)) -> 360 p >>= \a -> 361 get# \(State# _ (Loc# b1 _ _)) -> 362 k a (BS.take (I# (b1 -# b0)) i) 363 {-# INLINE withConsumed #-} 364 365 -- | Iterate a parsing function until it fails. I.e., 366 -- @iter f a = pure a >>= f >>= f >>= ...@ 367 -- where the chain of binds is as long as possible without failure. 368 iter :: (a -> ParseT e a) -> a -> ParseT e a 369 iter f a = (f a >>= iter f) <|> pure a 370 {-# INLINE iter #-} 371 372 -- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested 373 -- fashion. 374 chainl :: (b -> a -> b) -> ParseT e b -> ParseT e a -> ParseT e b 375 chainl f pb pa = pb >>= iter \b -> f b <$> pa 376 {-# INLINE chainl #-} 377 378 -- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a 379 -- right-nested fashion. 380 chainr :: (a -> b -> b) -> ParseT e a -> ParseT e b -> ParseT e b 381 chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb 382 {-# INLINE chainr #-} 383 384 many_ :: ParseT e a -> ParseT e () 385 -- many_ p = some_ p <|> pure () 386 many_ p = ParseT go 387 where 388 go s = case p `runParseT#` s of 389 Utf8Error# l -> Utf8Error# l 390 Failure# -> Success# () s 391 Error# e -> Error# e 392 Success# _ s' -> go s' 393 {-# INLINE many_ #-} 394 395 some_ :: ParseT e a -> ParseT e () 396 some_ p = p *> many_ p 397 {-# INLINE some_ #-} 398 399 manySepBy :: ParseT e a -> ParseT e b -> ParseT e [a] 400 manySepBy p sep = someSepBy p sep <|> pure [] 401 {-# INLINE manySepBy #-} 402 403 someSepBy :: ParseT e a -> ParseT e b -> ParseT e [a] 404 someSepBy p sep = (:) <$> p <*> many (sep *> p) 405 {-# INLINE someSepBy #-} 406 407 choose :: (Foldable t) => t (ParseT e a) -> ParseT e a 408 choose = asum 409 {-# INLINE choose #-} 410 411 -- | `match scrut cases` binds scrut to the first non-failing case and returns 412 -- the result. 413 match :: 414 (Functor t, Foldable t) => 415 ParseT e a -> 416 t (a -> ParseT e b) -> 417 ParseT e b 418 match scrut cases = choose $ (scrut >>=) <$> cases 419 {-# INLINE match #-} 420 421 -------------------------------------------------------------------------------- 422 -- Utf8 combinators 423 424 -- TODO: Try writing custom utf8 decoder that doesn't allocate Maybes. 425 -- TODO: Non-CPS version of next? 426 427 next :: ParseT e a -> (Char -> ParseT e a) -> ParseT e a 428 next ke kc = ParseT \s@(State# i l) -> 429 case UTF8.decode i of 430 Just ('\xFFFD', _) -> Utf8Error# l 431 Just (c, w) -> 432 -- TODO: This strictness annotation apparently does nothing. 433 let !i' = BS.drop w i 434 in kc c `runParseT#` State# i' (locAdvance# c w l) 435 Nothing -> ke `runParseT#` s 436 {-# INLINE next #-} 437 438 eof :: ParseT e () 439 eof = get# \(State# i _) -> guard (BS.null i) 440 {-# INLINE eof #-} 441 442 anyChar :: ParseT e Char 443 anyChar = next fail succeed 444 {-# INLINE anyChar #-} 445 446 charIf :: (Char -> Bool) -> ParseT e Char 447 charIf want = next fail (\c -> guard (want c) $> c) 448 {-# INLINE charIf #-} 449 450 char :: Char -> ParseT e () 451 char c = void $ charIf (== c) 452 {-# INLINE char #-} 453 454 string :: String -> ParseT e () 455 string = traverse_ char 456 {-# INLINE string #-} 457 458 natural :: Int -> ParseT e Natural 459 natural = \case 460 0 -> 461 let prefix l u = char '0' *> (char l <|> char u) 462 in prefix 'b' 'B' *> natural 2 463 <|> prefix 'q' 'Q' *> natural 4 464 <|> prefix 'o' 'O' *> natural 8 465 <|> prefix 'x' 'X' *> natural 16 466 <|> natural 10 467 b 468 | 0 < b && b <= 36 -> 469 let nat :: Int -> Natural 470 nat = fromIntegral 471 472 maxn = min (chr $ ord '0' + b) (succ '9') 473 maxu = min (chr $ ord 'A' + b - 10) (succ 'Z') 474 maxl = min (chr $ ord 'a' + b - 10) (succ 'z') 475 476 numeral = 477 anyChar >>= \c -> 478 if 479 | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0') 480 | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10) 481 | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10) 482 | otherwise -> fail 483 484 underscores = many (char '_') 485 in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral)) 486 _ -> error "natural: invalid base" 487 {-# INLINE natural #-}