Sparsec.hs (18897B)
1 module Sparsec ( 2 -- Char 3 charIsUpperLatin, 4 charIsLowerLatin, 5 charIsLatin, 6 charIsUpperGreek, 7 charIsLowerGreek, 8 charIsGreek, 9 charIsSafeUpperGreek, 10 charIsSafeLowerGreek, 11 charIsSafeGreek, 12 charIsNum2, 13 charIsNum4, 14 charIsNum8, 15 charIsNum10, 16 charIsNum16, 17 -- Rune 18 Rune (..), 19 runeElim, 20 runeIsUpperLatin, 21 runeIsLowerLatin, 22 runeIsLatin, 23 runeIsUpperGreek, 24 runeIsLowerGreek, 25 runeIsGreek, 26 runeIsSafeUpperGreek, 27 runeIsSafeLowerGreek, 28 runeIsSafeGreek, 29 runeIsNum2, 30 runeIsNum4, 31 runeIsNum8, 32 runeIsNum10, 33 runeIsNum16, 34 -- Loc 35 Loc#, 36 pattern Loc#, 37 locByte#, 38 locRow#, 39 locCol#, 40 locEq#, 41 Loc (..), 42 locZero, 43 locBox, 44 locUnbox, 45 -- Span 46 Span#, 47 pattern Span#, 48 spanBeg#, 49 spanEnd#, 50 spanEq#, 51 Span (..), 52 spanBox, 53 spanUnbox, 54 -- State 55 State#, 56 pattern State#, 57 stateInput#, 58 stateLoc#, 59 stateEq#, 60 State (..), 61 stateBox, 62 stateUnbox, 63 -- Result 64 Result# (..), 65 -- Parse monad 66 ParseT (..), 67 runParseT#, 68 runParseT, 69 mapParseT, 70 Parse, 71 runParse#, 72 runParse, 73 mapParse, 74 -- Core combinators 75 fail, 76 err, 77 succeed, 78 continue, 79 read, 80 write, 81 getLoc, 82 putLoc, 83 save, 84 load, 85 -- Derived combinators 86 not, 87 catch, 88 eof, 89 lookahead, 90 spanOf, 91 bytesOf, 92 cut, 93 try, 94 iter, 95 chainl, 96 chainr, 97 branch, 98 choice, 99 match, 100 someSepBy, 101 manySepBy, 102 -- TODO: Byte combinators 103 -- Utf8 combinators 104 Utf8Error (..), 105 nextRune, 106 nextChar, 107 runeIfM, 108 runeIf, 109 charIfM, 110 charIf, 111 runeWhileM, 112 runeWhile, 113 charWhileM, 114 charWhile, 115 rune, 116 char, 117 string, 118 natural, 119 ) where 120 121 import Control.Applicative 122 import Control.Monad hiding (fail) 123 import Control.Monad.Identity (Identity (..)) 124 import Control.Monad.Reader (MonadReader (..)) 125 import Control.Monad.State (MonadState (..)) 126 import Control.Monad.Trans (MonadTrans (..)) 127 import Data.ByteString (ByteString) 128 import Data.ByteString qualified as BS 129 import Data.ByteString.UTF8 qualified as UTF8 130 import Data.Char 131 import Data.Foldable 132 import Data.Functor 133 import GHC.Exts (Int (..), Int#, (==#)) 134 import Numeric.Natural 135 import Text.Printf 136 import Prelude hiding (fail, not, read) 137 138 -------------------------------------------------------------------------------- 139 -- Util 140 141 intEq# :: Int# -> Int# -> Bool 142 intEq# x y = case x ==# y of 143 0# -> False 144 _ -> True 145 146 -------------------------------------------------------------------------------- 147 -- Char 148 149 charIsUpperLatin, charIsLowerLatin, charIsLatin :: Char -> Bool 150 charIsUpperLatin c = 'A' <= c && c <= 'Z' 151 charIsLowerLatin c = 'a' <= c && c <= 'z' 152 charIsLatin c = charIsUpperLatin c || charIsLowerLatin c 153 154 charIsUpperGreek, charIsLowerGreek, charIsGreek :: Char -> Bool 155 charIsUpperGreek c = 'Α' <= c && c <= 'Ω' 156 charIsLowerGreek c = 'α' <= c && c <= 'ω' 157 charIsGreek c = charIsUpperGreek c || charIsLowerGreek c 158 159 charIsSafeUpperGreek, charIsSafeLowerGreek, charIsSafeGreek :: Char -> Bool 160 charIsSafeUpperGreek c = c == 'Γ' || c == 'Δ' || c == 'Θ' || c == 'Λ' || c == 'Ξ' || c == 'Π' || c == 'Σ' || c == 'Φ' || c == 'Ψ' || c == 'Ω' 161 charIsSafeLowerGreek c = charIsLowerGreek c && c /= 'ο' 162 charIsSafeGreek c = charIsSafeUpperGreek c || charIsSafeLowerGreek c 163 164 charIsNum2, charIsNum4, charIsNum8, charIsNum10, charIsNum16 :: Char -> Bool 165 charIsNum2 c = '0' <= c && c <= '1' 166 charIsNum4 c = '0' <= c && c <= '3' 167 charIsNum8 c = '0' <= c && c <= '7' 168 charIsNum10 c = '0' <= c && c <= '9' 169 charIsNum16 c = '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' 170 171 -------------------------------------------------------------------------------- 172 -- Rune 173 -- TODO: Are runes bloat? Should we delete them? 174 175 data Rune = RuneEof | RuneChar Char deriving (Eq) 176 177 instance Show Rune where 178 show = \case 179 RuneEof -> "EOF" 180 RuneChar c -> show c 181 182 runeElim :: a -> (Char -> a) -> Rune -> a 183 runeElim eof chr = \case 184 RuneEof -> eof 185 RuneChar c -> chr c 186 187 runeIsUpperLatin, runeIsLowerLatin, runeIsLatin :: Rune -> Bool 188 runeIsUpperLatin = runeElim False charIsUpperLatin 189 runeIsLowerLatin = runeElim False charIsLowerLatin 190 runeIsLatin = runeElim False charIsLatin 191 192 runeIsUpperGreek, runeIsLowerGreek, runeIsGreek :: Rune -> Bool 193 runeIsUpperGreek = runeElim False charIsUpperGreek 194 runeIsLowerGreek = runeElim False charIsLowerGreek 195 runeIsGreek = runeElim False charIsGreek 196 197 runeIsSafeUpperGreek, runeIsSafeLowerGreek, runeIsSafeGreek :: Rune -> Bool 198 runeIsSafeUpperGreek = runeElim False charIsSafeUpperGreek 199 runeIsSafeLowerGreek = runeElim False charIsSafeLowerGreek 200 runeIsSafeGreek = runeElim False charIsSafeGreek 201 202 runeIsNum2, runeIsNum4, runeIsNum8, runeIsNum10, runeIsNum16 :: Rune -> Bool 203 runeIsNum2 = runeElim False charIsNum2 204 runeIsNum4 = runeElim False charIsNum4 205 runeIsNum8 = runeElim False charIsNum8 206 runeIsNum10 = runeElim False charIsNum10 207 runeIsNum16 = runeElim False charIsNum16 208 209 -------------------------------------------------------------------------------- 210 -- Loc 211 212 newtype Loc# = Loc_# (# Int#, Int#, Int# #) 213 pattern Loc# :: Int# -> Int# -> Int# -> Loc# 214 pattern Loc# b r c = Loc_# (# b, r, c #) 215 {-# COMPLETE Loc# #-} 216 217 locByte#, locRow#, locCol# :: Loc# -> Int# 218 locByte# (Loc# b _ _) = b 219 locRow# (Loc# _ r _) = r 220 locCol# (Loc# _ _ c) = c 221 222 locEq# :: Loc# -> Loc# -> Bool 223 locEq# (Loc# b0 r0 c0) (Loc# b1 r1 c1) = 224 intEq# b0 b1 && intEq# r0 r1 && intEq# c0 c1 225 226 -- | Loc is a source code location, tracking byte offset, row (number of '\n' 227 -- characters seen so far), and column (number of characters into the current 228 -- row). All three of these quantities are 0-based. The column tracking is 229 -- somewhat stupid in that it treats each Unicode codepoint as one column. 230 data Loc = Loc {locByte :: Int, locRow :: Int, locCol :: Int} deriving (Eq) 231 232 locZero :: Loc 233 locZero = Loc 0 0 0 234 235 -- Weird Show instance intended for debugging 236 instance Show Loc where 237 show (Loc b r c) = printf "%d:%d#%d" r c b 238 239 locBox :: Loc# -> Loc 240 locBox (Loc# b r c) = Loc (I# b) (I# r) (I# c) 241 242 locUnbox :: Loc -> Loc# 243 locUnbox (Loc (I# b) (I# r) (I# c)) = Loc# b r c 244 245 -------------------------------------------------------------------------------- 246 -- Span 247 248 newtype Span# = Span_# (# Loc#, Loc# #) 249 pattern Span# :: Loc# -> Loc# -> Span# 250 pattern Span# beg end = Span_# (# beg, end #) 251 {-# COMPLETE Span# #-} 252 253 spanBeg#, spanEnd# :: Span# -> Loc# 254 spanBeg# (Span# beg _) = beg 255 spanEnd# (Span# _ end) = end 256 257 spanEq# :: Span# -> Span# -> Bool 258 spanEq# (Span# beg0 end0) (Span# beg1 end1) = 259 locEq# beg0 beg1 && locEq# end0 end1 260 261 -- | Span is a pair of Loc's specifying a range in the input. The beginning/left 262 -- Loc is inclusive and the ending/right Loc is exclusive. 263 data Span = Span {spanBeg :: Loc, spanEnd :: Loc} deriving (Eq) 264 265 -- Weird Show instance intended for debugging 266 instance Show Span where 267 show (Span beg end) = printf "%s--%s" (show beg) (show end) 268 269 spanBox :: Span# -> Span 270 spanBox (Span# beg end) = Span (locBox beg) (locBox end) 271 272 spanUnbox :: Span -> Span# 273 spanUnbox (Span beg end) = Span# (locUnbox beg) (locUnbox end) 274 275 -------------------------------------------------------------------------------- 276 -- State 277 278 newtype State# = State_# (# ByteString, Loc# #) 279 pattern State# :: ByteString -> Loc# -> State# 280 pattern State# i l = State_# (# i, l #) 281 {-# COMPLETE State# #-} 282 283 stateInput# :: State# -> ByteString 284 stateInput# (State# i _) = i 285 286 stateLoc# :: State# -> Loc# 287 stateLoc# (State# _ l) = l 288 289 stateEq# :: State# -> State# -> Bool 290 stateEq# (State# i0 l0) (State# i1 l1) = i0 == i1 && locEq# l0 l1 291 292 data State = State {stateInput :: ByteString, stateLoc :: Loc} deriving (Eq) 293 294 stateBox :: State# -> State 295 stateBox (State# i l) = State i (locBox l) 296 297 stateUnbox :: State -> State# 298 stateUnbox (State i l) = State# i (locUnbox l) 299 300 -------------------------------------------------------------------------------- 301 -- Result 302 303 -- TODO: Result (no hash) 304 305 data Result# e a = Failure# | Error# e | Success# a State# 306 307 instance (Eq e, Eq a) => Eq (Result# e a) where 308 Failure# == Failure# = True 309 Error# e0 == Error# e1 = e0 == e1 310 Success# a0 s0 == Success# a1 s1 = a0 == a1 && stateEq# s0 s1 311 _ == _ = False 312 313 instance (Show e, Show a) => Show (Result# e a) where 314 show = \case 315 Failure# -> "failure" 316 Error# e -> "error: " ++ show e 317 Success# a (State# i _) -> 318 if BS.null i 319 then 320 printf "success: %s" (show a) 321 else 322 printf 323 "success (%d bytes remaining): %s" 324 (BS.length i) 325 (show a) 326 327 instance Functor (Result# e) where 328 fmap f = \case 329 Failure# -> Failure# 330 Error# e -> Error# e 331 Success# a s -> Success# (f a) s 332 333 -------------------------------------------------------------------------------- 334 -- ParseT 335 336 -- | ParseT is a monad transformer for parsing. It effectively has a ByteString 337 -- and Loc state effect for the input, and an error effect (of some type e) for 338 -- parsing errors. Parsing *errors* are distinct from parsing *failures* in 339 -- that only the latter trigger backtracking in the Alternative instance. 340 newtype ParseT e m a = ParseT (State# -> m (Result# e a)) 341 342 runParseT# :: (Monad m) => ParseT e m a -> State# -> m (Result# e a) 343 runParseT# (ParseT f) = f 344 345 runParseT :: (Monad m) => ParseT e m a -> ByteString -> Loc -> m (Result# e a) 346 runParseT p i l = p `runParseT#` State# i (locUnbox l) 347 348 mapParseT :: 349 (Monad m, Monad m') => 350 (m (Result# e a) -> m' (Result# e' a')) -> 351 ParseT e m a -> 352 ParseT e' m' a' 353 mapParseT f p = ParseT \s -> f $ p `runParseT#` s 354 355 -------------------------------------------------------------------------------- 356 -- Parse 357 358 type Parse e a = ParseT e Identity a 359 360 runParse# :: Parse e a -> State# -> Result# e a 361 runParse# p s = runIdentity $ p `runParseT#` s 362 363 runParse :: Parse e a -> ByteString -> Loc -> Result# e a 364 runParse p i l = p `runParse#` State# i (locUnbox l) 365 366 mapParse :: (Result# e a -> Result# e' a') -> Parse e a -> Parse e' a' 367 mapParse f = mapParseT (Identity . f . runIdentity) 368 369 -------------------------------------------------------------------------------- 370 -- Core combinators 371 372 fail :: (Monad m) => ParseT e m a 373 fail = ParseT \_ -> pure Failure# 374 375 err :: (Monad m) => e -> ParseT e m a 376 err e = ParseT \_ -> pure $ Error# e 377 378 succeed :: (Monad m) => a -> ParseT e m a 379 succeed a = ParseT \s -> pure $ Success# a s 380 381 continue :: (Monad m) => ParseT e' m a' -> (e -> ParseT e' m a') -> (a -> ParseT e' m a') -> ParseT e m a -> ParseT e' m a' 382 continue kf ke ks p = ParseT \s -> 383 p `runParseT#` s >>= \case 384 Failure# -> kf `runParseT#` s 385 Error# e -> ke e `runParseT#` s 386 Success# a s' -> ks a `runParseT#` s' 387 388 -- | Read the entire input without consuming it. 389 read :: (Monad m) => ParseT e m ByteString 390 read = ParseT \s -> pure $ Success# (stateInput# s) s 391 392 -- | Replace the entire input without affecting the current location. 393 write :: (Monad m) => ByteString -> ParseT e m () 394 write i = ParseT \(State# _ l) -> pure $ Success# () (State# i l) 395 396 getLoc :: (Monad m) => ParseT e m Loc 397 getLoc = ParseT \s -> pure $ Success# (locBox $ stateLoc# s) s 398 399 putLoc :: (Monad m) => Loc -> ParseT e m () 400 putLoc l = ParseT \(State# i _) -> pure $ Success# () (State# i (locUnbox l)) 401 402 save :: (Monad m) => ParseT e m State 403 save = ParseT \s -> pure $ Success# (stateBox s) s 404 405 load :: (Monad m) => State -> ParseT e m () 406 load s = ParseT \_ -> pure $ Success# () (stateUnbox s) 407 408 -------------------------------------------------------------------------------- 409 -- Instances 410 411 instance (Monad m) => Functor (ParseT e m) where 412 fmap = liftM 413 414 instance (Monad m) => Applicative (ParseT e m) where 415 pure = succeed 416 (<*>) = ap 417 418 instance (Monad m) => Monad (ParseT e m) where 419 p >>= k = continue fail err k p 420 421 instance (Monad m) => Alternative (ParseT e m) where 422 empty = fail 423 p <|> q = continue q err succeed p 424 425 instance MonadTrans (ParseT e) where 426 lift m = ParseT \s -> (`Success#` s) <$> m 427 428 instance (MonadReader r m) => MonadReader r (ParseT e m) where 429 ask = lift ask 430 local f p = ParseT \s -> local f $ p `runParseT#` s 431 432 instance (MonadState s m) => MonadState s (ParseT e m) where 433 get = lift get 434 put = lift . put 435 436 -------------------------------------------------------------------------------- 437 -- General combinators 438 439 -- | Turn a failure into a success and vice versa. 440 not :: (Monad m) => ParseT e m a -> ParseT e m () 441 not = continue (succeed ()) err (const fail) 442 443 -- | Catch an error. 444 catch :: (Monad m) => ParseT e m a -> (e -> ParseT e' m a) -> ParseT e' m a 445 p `catch` h = continue fail h succeed p 446 447 eof :: (Monad m) => ParseT e m () 448 eof = guard . BS.null =<< read 449 450 -- | Run a parser without changing the parser state. 451 lookahead :: (Monad m) => ParseT e m a -> ParseT e m a 452 lookahead p = do 453 s <- save 454 a <- p 455 load s 456 pure a 457 458 spanOf :: (Monad m) => ParseT e m a -> ParseT e m (a, Span) 459 spanOf p = do 460 beg <- getLoc 461 a <- p 462 end <- getLoc 463 pure (a, Span beg end) 464 465 bytesOf :: (Monad m) => ParseT e m a -> ParseT e m (a, ByteString) 466 bytesOf p = do 467 State i beg <- save 468 a <- p 469 end <- getLoc 470 pure (a, BS.take (locByte end - locByte beg) i) 471 472 -- | Convert a failure into an error. 473 -- 474 -- TODO: Why do we call this "cut"? 475 cut :: (Monad m) => ParseT e m a -> e -> ParseT e m a 476 p `cut` e = p <|> err e 477 478 -- | Convert an error into a failure. 479 try :: (Monad m) => ParseT e m a -> ParseT e m a 480 try p = p `catch` const fail 481 482 -- | Iterate a parsing function until it fails. I.e., 483 -- @iter f a = pure a >>= f >>= f >>= ...@ 484 -- where the chain of binds is as long as possible without failure. 485 iter :: (Monad m) => (a -> ParseT e m a) -> a -> ParseT e m a 486 iter f a = (f a >>= iter f) <|> pure a 487 488 -- | Parse a `b` and then zero or more `a`s, and then combine in a left-nested 489 -- fashion. 490 chainl :: (Monad m) => (b -> a -> b) -> ParseT e m b -> ParseT e m a -> ParseT e m b 491 chainl f pb pa = pb >>= iter \b -> f b <$> pa 492 493 -- | Parse zero or more `a`s (greedily) and then a `b`, and then combine in a 494 -- right-nested fashion. 495 chainr :: (Monad m) => (a -> b -> b) -> ParseT e m a -> ParseT e m b -> ParseT e m b 496 chainr f pa pb = f <$> pa <*> chainr f pa pb <|> pb 497 498 -- | `branch pc pt pf` runs pc; if it succeeds, it continues with pt, and 499 -- otherwise it continues with pf. 500 branch :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m b -> ParseT e m b 501 branch p kt kf = continue kf err (const kt) p 502 503 choice :: (Monad m, Foldable t) => t (ParseT e m a) -> ParseT e m a 504 choice = asum 505 506 -- | `match scrut cases` binds scrut to the first non-failing case and returns 507 -- the result. 508 match :: 509 (Monad m, Functor t, Foldable t) => 510 ParseT e m a -> 511 t (a -> ParseT e m b) -> 512 ParseT e m b 513 match scrut cases = choice $ (scrut >>=) <$> cases 514 515 someSepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a] 516 someSepBy p sep = (:) <$> p <*> many (sep *> p) 517 518 manySepBy :: (Monad m) => ParseT e m a -> ParseT e m b -> ParseT e m [a] 519 manySepBy p sep = someSepBy p sep <|> pure [] 520 521 -------------------------------------------------------------------------------- 522 -- Byte-wise parse combinators 523 524 -- TODO 525 526 -------------------------------------------------------------------------------- 527 -- Utf8 parse combinators 528 529 -- All these combinators involve UTF-8 decoding and therefore require that 530 -- the error type e supports UTF-8 errors. 531 532 class Utf8Error e where 533 utf8Error :: Loc -> e 534 535 nextRune :: forall e m. (Utf8Error e, Monad m) => ParseT e m Rune 536 nextRune = toRune . UTF8.decode =<< read 537 where 538 toRune :: Maybe (Char, Int) -> ParseT e m Rune 539 toRune = \case 540 Just (c, w) 541 | c == UTF8.replacement_char -> err . utf8Error =<< getLoc 542 | otherwise -> updateState c w $> RuneChar c 543 Nothing -> pure RuneEof 544 545 updateState :: Char -> Int -> ParseT e m () 546 updateState c w = do 547 write . BS.drop w =<< read 548 loc <- getLoc 549 putLoc case c of 550 '\n' -> Loc (locByte loc + w) (locRow loc + 1) 0 551 _ -> Loc (locByte loc + w) (locRow loc) (locCol loc + 1) 552 553 nextChar :: (Utf8Error e, Monad m) => ParseT e m Char 554 nextChar = 555 nextRune >>= \case 556 RuneEof -> fail 557 RuneChar c -> pure c 558 559 runeIfM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m Rune 560 runeIfM want = do 561 r <- nextRune 562 lift (want r) >>= guard 563 pure r 564 565 runeIf :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m Rune 566 runeIf want = runeIfM (pure . want) 567 568 charIfM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m Char 569 charIfM want = do 570 c <- nextChar 571 lift (want c) >>= guard 572 pure c 573 574 charIf :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m Char 575 charIf want = charIfM (pure . want) 576 577 runeWhileM :: (Utf8Error e, Monad m) => (Rune -> m Bool) -> ParseT e m ByteString 578 runeWhileM want = snd <$> bytesOf (many (runeIfM want)) 579 580 runeWhile :: (Utf8Error e, Monad m) => (Rune -> Bool) -> ParseT e m ByteString 581 runeWhile want = runeWhileM (pure . want) 582 583 charWhileM :: (Utf8Error e, Monad m) => (Char -> m Bool) -> ParseT e m ByteString 584 charWhileM want = runeWhileM \case 585 RuneEof -> pure False 586 RuneChar c -> want c 587 588 charWhile :: (Utf8Error e, Monad m) => (Char -> Bool) -> ParseT e m ByteString 589 charWhile want = charWhileM (pure . want) 590 591 rune :: (Utf8Error e, Monad m) => Rune -> ParseT e m () 592 rune r = void $ runeIf (== r) 593 594 char :: (Utf8Error e, Monad m) => Char -> ParseT e m () 595 char c = void $ charIf (== c) 596 597 string :: (Utf8Error e, Monad m) => String -> ParseT e m () 598 string = traverse_ char 599 600 natural :: (Utf8Error e, Monad m) => Int -> ParseT e m Natural 601 natural = \case 602 0 -> 603 let prefix l u = char '0' *> (char l <|> char u) 604 in prefix 'b' 'B' *> natural 2 605 <|> prefix 'q' 'Q' *> natural 4 606 <|> prefix 'o' 'O' *> natural 8 607 <|> prefix 'x' 'X' *> natural 16 608 <|> natural 10 609 b 610 | 0 < b && b <= 36 -> 611 let nat :: Int -> Natural 612 nat = fromIntegral 613 614 maxn = min (chr $ ord '0' + b) (succ '9') 615 maxu = min (chr $ ord 'A' + b - 10) (succ 'Z') 616 maxl = min (chr $ ord 'a' + b - 10) (succ 'z') 617 618 numeral = 619 nextChar >>= \c -> 620 if 621 | '0' <= c && c < maxn -> pure $ nat (ord c - ord '0') 622 | 'A' <= c && c < maxu -> pure $ nat (ord c - ord 'A' + 10) 623 | 'a' <= c && c < maxl -> pure $ nat (ord c - ord 'a' + 10) 624 | otherwise -> fail 625 626 underscores = many (char '_') 627 in numeral >>= iter (\n -> (nat b * n +) <$> (underscores *> numeral)) 628 _ -> error "natural: invalid base"