diff options
| author | Albert Krewinkel <[email protected]> | 2022-03-11 18:21:59 +0100 |
|---|---|---|
| committer | GitHub <[email protected]> | 2022-03-11 09:21:59 -0800 |
| commit | 517bceeba81cf645e428e217d1cc518a8fcd34d7 (patch) | |
| tree | 70c84ff4dade9b536ee217cac93c78e6c1aba263 /src | |
| parent | 168529f0a4b7f2208f7a6d817d3b972d8b387570 (diff) | |
Parsing: partition module into (internal) submodules (#7962)
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/Parsing.hs | 1482 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/Capabilities.hs | 137 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/Citations.hs | 57 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/Combinators.hs | 751 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/GridTable.hs | 217 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/Lists.hs | 215 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/Math.hs | 96 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/Smart.hs | 145 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/State.hs | 191 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/Types.hs | 57 |
10 files changed, 1876 insertions, 1472 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index d007dfc17..b663f7fa9 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -1,11 +1,3 @@ -{-# LANGUAGE CPP #-} -{-# LANGUAGE ExplicitForAll #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE GeneralizedNewtypeDeriving #-} -{-# LANGUAGE IncoherentInstances #-} -{-# LANGUAGE MultiParamTypeClasses #-} -{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Parsing Copyright : Copyright (C) 2006-2022 John MacFarlane @@ -175,77 +167,9 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources, ) where -import Control.Monad.Identity - ( guard, - join, - unless, - when, - void, - liftM2, - liftM, - Identity(..), - MonadPlus(mzero) ) -import Control.Monad.Reader - ( asks, runReader, MonadReader(ask), Reader, ReaderT(ReaderT) ) -import Data.Char (chr, isAlphaNum, isAscii, isAsciiUpper, isAsciiLower, - isSpace, ord, toLower, toUpper) -import Data.Default ( Default(..) ) -import Data.Functor (($>)) -import Data.List (intercalate, transpose) -import qualified Data.Map as M -import Data.Maybe (fromMaybe) -import qualified Data.Set as Set -import Data.Text (Text) -import qualified Data.Text as T -import Text.HTML.TagSoup.Entity (lookupEntity) -import Text.Pandoc.Asciify (toAsciiText) -import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines) -import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Class.PandocMonad (PandocMonad, readFileFromDirs, report) -import Text.Pandoc.Definition - ( Target, - nullMeta, - nullAttr, - Meta, - ColWidth(ColWidthDefault, ColWidth), - TableFoot(TableFoot), - TableBody(TableBody), - Attr, - TableHead(TableHead), - Row(..), - Alignment(..), - Inline(Str), - ListNumberDelim(..), - ListAttributes, - ListNumberStyle(..) ) -import Text.Pandoc.Logging - ( LogMessage(CouldNotLoadIncludeFile, DuplicateIdentifier) ) -import Text.Pandoc.Options - ( extensionEnabled, - Extension(Ext_old_dashes, Ext_tex_math_dollars, - Ext_tex_math_single_backslash, Ext_tex_math_double_backslash, - Ext_auto_identifiers, Ext_ascii_identifiers, Ext_smart), - ReaderOptions(readerTabStop, readerColumns, readerExtensions) ) -import Text.Pandoc.Readers.LaTeX.Types (Macro) -import Text.Pandoc.Shared - ( uniqueIdent, - tshow, - mapLeft, - compactify, - trim, - trimr, - splitTextByIndices, - safeRead, - trimMath, - schemes, - escapeURI ) import Text.Pandoc.Sources -import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) -import Text.Pandoc.XML (fromEntities) import Text.Parsec - ( between, - setSourceName, - Parsec, + ( setSourceName, Column, Line, incSourceLine, @@ -292,1401 +216,15 @@ import Text.Parsec runParserT, runParser, ParseError, - ParsecT, SourcePos, Stream(..) ) import Text.Parsec.Pos (initialPos, newPos) -import Control.Monad.Except ( MonadError(throwError) ) -import Text.Pandoc.Error - ( PandocError(PandocParseError, PandocParsecError) ) - -type Parser t s = Parsec t s - -type ParserT = ParsecT - - --- | Reader monad wrapping the parser state. This is used to possibly delay --- evaluation until all relevant information has been parsed and made available --- in the parser state. -newtype Future s a = Future { runDelayed :: Reader s a } - deriving (Monad, Applicative, Functor) - -runF :: Future s a -> s -> a -runF = runReader . runDelayed - -askF :: Future s s -askF = Future ask - -asksF :: (s -> a) -> Future s a -asksF f = Future $ asks f - -returnF :: Monad m => a -> m (Future s a) -returnF = return . return - -trimInlinesF :: Future s Inlines -> Future s Inlines -trimInlinesF = liftM trimInlines - -instance Semigroup a => Semigroup (Future s a) where - (<>) = liftM2 (<>) -instance (Semigroup a, Monoid a) => Monoid (Future s a) where - mempty = return mempty - mappend = (<>) - --- | Like @count@, but packs its result -countChar :: (Stream s m Char, UpdateSourcePos s Char, Monad m) - => Int - -> ParsecT s st m Char - -> ParsecT s st m Text -countChar n = fmap T.pack . count n - --- | Like @string@, but uses @Text@. -textStr :: (Stream s m Char, UpdateSourcePos s Char) - => Text -> ParsecT s u m Text -textStr t = string (T.unpack t) $> t - - --- | Parse any line of text, returning the contents without the --- final newline. -anyLine :: Monad m => ParserT Sources st m Text -anyLine = do - -- This is much faster than: - -- manyTill anyChar newline - inp <- getInput - case inp of - Sources [] -> mzero - Sources ((fp,t):inps) -> - -- we assume that lines don't span different input files - case T.break (=='\n') t of - (this, rest) - | T.null rest - , not (null inps) -> - -- line may span different input files, so do it - -- character by character - T.pack <$> manyTill anyChar newline - | otherwise -> do -- either end of inputs or newline in rest - setInput $ Sources ((fp, rest):inps) - char '\n' -- needed so parsec knows we won't match empty string - -- and so source pos is updated - return this - --- | Parse any line, include the final newline in the output -anyLineNewline :: Monad m => ParserT Sources st m Text -anyLineNewline = (<> "\n") <$> anyLine - --- | Parse indent by specified number of spaces (or equiv. tabs) -indentWith :: (Stream s m Char, UpdateSourcePos s Char) - => HasReaderOptions st - => Int -> ParserT s st m Text -indentWith num = do - tabStop <- getOption readerTabStop - if num < tabStop - then countChar num (char ' ') - else choice [ try (countChar num (char ' ')) - , try (char '\t' >> indentWith (num - tabStop)) ] - --- | Like @many@, but packs its result. -manyChar :: Stream s m t - => ParserT s st m Char - -> ParserT s st m Text -manyChar = fmap T.pack . many - --- | Like @many1@, but packs its result. -many1Char :: Stream s m t - => ParserT s st m Char - -> ParserT s st m Text -many1Char = fmap T.pack . many1 - --- | Like @manyTill@, but packs its result. -manyTillChar :: Stream s m t - => ParserT s st m Char - -> ParserT s st m a - -> ParserT s st m Text -manyTillChar p = fmap T.pack . manyTill p - --- | Like @manyTill@, but reads at least one item. -many1Till :: (Show end, Stream s m t) - => ParserT s st m a - -> ParserT s st m end - -> ParserT s st m [a] -many1Till p end = do - notFollowedBy' end - first <- p - rest <- manyTill p end - return (first:rest) - --- | Like @many1Till@, but packs its result -many1TillChar :: (Show end, Stream s m t) - => ParserT s st m Char - -> ParserT s st m end - -> ParserT s st m Text -many1TillChar p = fmap T.pack . many1Till p - --- | Like @manyTill@, but also returns the result of end parser. -manyUntil :: ParserT s u m a - -> ParserT s u m b - -> ParserT s u m ([a], b) -manyUntil p end = scan - where scan = - (do e <- end - return ([], e) - ) <|> - (do x <- p - (xs, e) <- scan - return (x:xs, e)) - --- | Like @manyUntil@, but also packs its result. -manyUntilChar :: ParserT s u m Char - -> ParserT s u m b - -> ParserT s u m (Text, b) -manyUntilChar p = fmap go . manyUntil p - where - go (x, y) = (T.pack x, y) - --- | Like @sepBy1@ from Parsec, --- but does not fail if it @sep@ succeeds and @p@ fails. -sepBy1' :: ParsecT s u m a - -> ParsecT s u m sep - -> ParsecT s u m [a] -sepBy1' p sep = (:) <$> p <*> many (try $ sep >> p) - --- | A more general form of @notFollowedBy@. This one allows any --- type of parser to be specified, and succeeds only if that parser fails. --- It does not consume any input. -notFollowedBy' :: (Show b, Stream s m a) => ParserT s st m b -> ParserT s st m () -notFollowedBy' p = try $ join $ do a <- try p - return (unexpected (show a)) - <|> - return (return ()) --- (This version due to Andrew Pimlott on the Haskell mailing list.) - -oneOfStrings' :: (Stream s m Char, UpdateSourcePos s Char) - => (Char -> Char -> Bool) -> [Text] -> ParserT s st m Text -oneOfStrings' f = fmap T.pack . oneOfStrings'' f . fmap T.unpack - --- TODO: This should be re-implemented in a Text-aware way -oneOfStrings'' :: (Stream s m Char, UpdateSourcePos s Char) - => (Char -> Char -> Bool) -> [String] -> ParserT s st m String -oneOfStrings'' _ [] = Prelude.fail "no strings" -oneOfStrings'' matches strs = try $ do - c <- anyChar - let strs' = [xs | (x:xs) <- strs, x `matches` c] - case strs' of - [] -> Prelude.fail "not found" - _ -> (c:) <$> oneOfStrings'' matches strs' - <|> if "" `elem` strs' - then return [c] - else Prelude.fail "not found" - --- | Parses one of a list of strings. If the list contains --- two strings one of which is a prefix of the other, the longer --- string will be matched if possible. -oneOfStrings :: (Stream s m Char, UpdateSourcePos s Char) - => [Text] -> ParserT s st m Text -oneOfStrings = oneOfStrings' (==) - --- | Parses one of a list of strings (tried in order), case insensitive. - --- TODO: This will not be accurate with general Unicode (neither --- Text.toLower nor Text.toCaseFold can be implemented with a map) -oneOfStringsCI :: (Stream s m Char, UpdateSourcePos s Char) - => [Text] -> ParserT s st m Text -oneOfStringsCI = oneOfStrings' ciMatch - where ciMatch x y = toLower' x == toLower' y - -- this optimizes toLower by checking common ASCII case - -- first, before calling the expensive unicode-aware - -- function: - toLower' c | isAsciiUpper c = chr (ord c + 32) - | isAscii c = c - | otherwise = toLower c - --- | Parses a space or tab. -spaceChar :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Char -spaceChar = satisfy $ \c -> c == ' ' || c == '\t' - --- | Parses a nonspace, nonnewline character. -nonspaceChar :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Char -nonspaceChar = satisfy (not . isSpaceChar) - -isSpaceChar :: Char -> Bool -isSpaceChar ' ' = True -isSpaceChar '\t' = True -isSpaceChar '\n' = True -isSpaceChar '\r' = True -isSpaceChar _ = False - --- | Skips zero or more spaces or tabs. -skipSpaces :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m () -skipSpaces = skipMany spaceChar - --- | Skips zero or more spaces or tabs, then reads a newline. -blankline :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Char -blankline = try $ skipSpaces >> newline - --- | Parses one or more blank lines and returns a string of newlines. -blanklines :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Text -blanklines = T.pack <$> many1 blankline - --- | Gobble n spaces; if tabs are encountered, expand them --- and gobble some or all of their spaces, leaving the rest. -gobbleSpaces :: (HasReaderOptions st, Monad m) - => Int -> ParserT Sources st m () -gobbleSpaces 0 = return () -gobbleSpaces n - | n < 0 = error "gobbleSpaces called with negative number" - | otherwise = try $ do - char ' ' <|> eatOneSpaceOfTab - gobbleSpaces (n - 1) - -eatOneSpaceOfTab :: (HasReaderOptions st, Monad m) => ParserT Sources st m Char -eatOneSpaceOfTab = do - lookAhead (char '\t') - pos <- getPosition - tabstop <- getOption readerTabStop - -- replace the tab on the input stream with spaces - let numSpaces = tabstop - ((sourceColumn pos - 1) `mod` tabstop) - inp <- getInput - setInput $ - case inp of - Sources [] -> error "eatOneSpaceOfTab - empty Sources list" - Sources ((fp,t):rest) -> - -- drop the tab and add spaces - Sources ((fp, T.replicate numSpaces " " <> T.drop 1 t):rest) - char ' ' - --- | Gobble up to n spaces; if tabs are encountered, expand them --- and gobble some or all of their spaces, leaving the rest. -gobbleAtMostSpaces :: (HasReaderOptions st, Monad m) - => Int -> ParserT Sources st m Int -gobbleAtMostSpaces 0 = return 0 -gobbleAtMostSpaces n - | n < 0 = error "gobbleAtMostSpaces called with negative number" - | otherwise = option 0 $ do - char ' ' <|> eatOneSpaceOfTab - (+ 1) <$> gobbleAtMostSpaces (n - 1) - --- | Parses material enclosed between start and end parsers. -enclosed :: (Show end, Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m t -- ^ start parser - -> ParserT s st m end -- ^ end parser - -> ParserT s st m a -- ^ content parser (to be used repeatedly) - -> ParserT s st m [a] -enclosed start end parser = try $ - start >> notFollowedBy space >> many1Till parser end - --- | Parse string, case insensitive. -stringAnyCase :: (Stream s m Char, UpdateSourcePos s Char) - => Text -> ParserT s st m Text -stringAnyCase = fmap T.pack . stringAnyCase' . T.unpack - -stringAnyCase' :: (Stream s m Char, UpdateSourcePos s Char) - => String -> ParserT s st m String -stringAnyCase' [] = string "" -stringAnyCase' (x:xs) = do - firstChar <- char (toUpper x) <|> char (toLower x) - rest <- stringAnyCase' xs - return (firstChar:rest) - --- TODO rewrite by just adding to Sources stream? --- | Parse contents of 'str' using 'parser' and return result. -parseFromString :: Monad m - => ParserT Sources st m r - -> Text - -> ParserT Sources st m r -parseFromString parser str = do - oldPos <- getPosition - oldInput <- getInput - setInput $ toSources str - setPosition $ initialPos $ sourceName oldPos <> "_chunk" - result <- parser - spaces - setInput oldInput - setPosition oldPos - return result - --- | Like 'parseFromString' but specialized for 'ParserState'. --- This resets 'stateLastStrPos', which is almost always what we want. -parseFromString' :: (Monad m, HasLastStrPosition u) - => ParserT Sources u m a - -> Text - -> ParserT Sources u m a -parseFromString' parser str = do - oldLastStrPos <- getLastStrPos <$> getState - updateState $ setLastStrPos Nothing - res <- parseFromString parser str - updateState $ setLastStrPos oldLastStrPos - return res - --- | Parse raw line block up to and including blank lines. -lineClump :: Monad m => ParserT Sources st m Text -lineClump = blanklines - <|> (T.unlines <$> many1 (notFollowedBy blankline >> anyLine)) - --- | Parse a string of characters between an open character --- and a close character, including text between balanced --- pairs of open and close, which must be different. For example, --- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))" --- and return "hello (there)". -charsInBalanced :: (Stream s m Char, UpdateSourcePos s Char) => Char -> Char -> ParserT s st m Char - -> ParserT s st m Text -charsInBalanced open close parser = try $ do - char open - let isDelim c = c == open || c == close - raw <- many $ T.pack <$> many1 (notFollowedBy (satisfy isDelim) >> parser) - <|> (do res <- charsInBalanced open close parser - return $ T.singleton open <> res <> T.singleton close) - char close - return $ T.concat raw - --- old charsInBalanced would be: --- charsInBalanced open close (noneOf "\n" <|> char '\n' >> notFollowedBy blankline) --- old charsInBalanced' would be: --- charsInBalanced open close anyChar - --- Auxiliary functions for romanNumeral: - --- | Parses a roman numeral (uppercase or lowercase), returns number. -romanNumeral :: (Stream s m Char, UpdateSourcePos s Char) => Bool -- ^ Uppercase if true - -> ParserT s st m Int -romanNumeral upperCase = do - let rchar uc = char $ if upperCase then uc else toLower uc - let one = rchar 'I' - let five = rchar 'V' - let ten = rchar 'X' - let fifty = rchar 'L' - let hundred = rchar 'C' - let fivehundred = rchar 'D' - let thousand = rchar 'M' - lookAhead $ choice [one, five, ten, fifty, hundred, fivehundred, thousand] - thousands <- (1000 *) . length <$> many thousand - ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900 - fivehundreds <- option 0 $ 500 <$ fivehundred - fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400 - hundreds <- (100 *) . length <$> many hundred - nineties <- option 0 $ try $ ten >> hundred >> return 90 - fifties <- option 0 (50 <$ fifty) - forties <- option 0 $ try $ ten >> fifty >> return 40 - tens <- (10 *) . length <$> many ten - nines <- option 0 $ try $ one >> ten >> return 9 - fives <- option 0 (5 <$ five) - fours <- option 0 $ try $ one >> five >> return 4 - ones <- length <$> many one - let total = thousands + ninehundreds + fivehundreds + fourhundreds + - hundreds + nineties + fifties + forties + tens + nines + - fives + fours + ones - if total == 0 - then Prelude.fail "not a roman numeral" - else return total - --- Parsers for email addresses and URIs - --- | Parses an email address; returns original and corresponding --- escaped mailto: URI. -emailAddress :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (Text, Text) -emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) - where toResult mbox dom = let full = fromEntities $ T.pack $ mbox ++ '@':dom - in (full, escapeURI $ "mailto:" <> full) - mailbox = intercalate "." <$> (emailWord `sepBy1'` dot) - domain = intercalate "." <$> (subdomain `sepBy1'` dot) - dot = char '.' - subdomain = many1 $ alphaNum <|> innerPunct (=='-') - -- this excludes some valid email addresses, since an - -- email could contain e.g. '__', but gives better results - -- for our purposes, when combined with markdown parsing: - innerPunct f = try (satisfy f - <* notFollowedBy (satisfy (not . isAlphaNum))) - -- technically an email address could begin with a symbol, - -- but allowing this creates too many problems. - -- See e.g. https://github.com/jgm/pandoc/issues/2940 - emailWord = do x <- satisfy isAlphaNum - xs <- many (satisfy isEmailChar) - return (x:xs) - isEmailChar c = isAlphaNum c || isEmailPunct c - isEmailPunct c = T.any (== c) "!\"#$%&'*+-/=?^_{|}~;" - - -uriScheme :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Text -uriScheme = oneOfStringsCI (Set.toList schemes) - --- | Parses a URI. Returns pair of original and URI-escaped version. -uri :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (Text, Text) -uri = try $ do - scheme <- uriScheme - char ':' - -- Avoid parsing e.g. "**Notes:**" as a raw URI: - notFollowedBy $ satisfy (\c -> c == '*' || c == '_' || c == ']') - -- We allow sentence punctuation except at the end, since - -- we don't want the trailing '.' in 'http://google.com.' We want to allow - -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) - -- as a URL, while NOT picking up the closing paren in - -- (http://wikipedia.org). So we include balanced parens in the URL. - str <- T.concat <$> many1 (uriChunkBetween '(' ')' - <|> uriChunkBetween '{' '}' - <|> uriChunkBetween '[' ']' - <|> T.pack <$> uriChunk) - str' <- option str $ char '/' >> return (str <> "/") - let uri' = scheme <> ":" <> fromEntities str' - return (uri', escapeURI uri') - where - isWordChar '#' = True - isWordChar '$' = True - isWordChar '%' = True - isWordChar '+' = True - isWordChar '/' = True - isWordChar '@' = True - isWordChar '\\' = True - isWordChar '_' = True - isWordChar '-' = True - isWordChar '&' = True - isWordChar '=' = True - isWordChar c = isAlphaNum c - - wordChar = satisfy isWordChar - percentEscaped = try $ (:) <$> char '%' <*> many1 hexDigit - entity = try $ pure <$> characterReference - punct = try $ many1 (char ',') <|> fmap pure (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>')) - uriChunk = many1 wordChar - <|> percentEscaped - <|> entity - <|> try (punct <* lookAhead (void wordChar <|> void percentEscaped)) - uriChunkBetween l r = try $ do chunk <- between (char l) (char r) uriChunk - return (T.pack $ [l] ++ chunk ++ [r]) - -mathInlineWith :: (Stream s m Char, UpdateSourcePos s Char) => Text -> Text -> ParserT s st m Text -mathInlineWith op cl = try $ do - textStr op - when (op == "$") $ notFollowedBy space - words' <- many1Till ( - (T.singleton <$> - satisfy (\c -> not (isSpaceChar c || c == '\\'))) - <|> (char '\\' >> - -- This next clause is needed because \text{..} can - -- contain $, \(\), etc. - (try (string "text" >> - (("\\text" <>) <$> inBalancedBraces 0 "")) - <|> (\c -> T.pack ['\\',c]) <$> anyChar)) - <|> do (blankline <* notFollowedBy' blankline) <|> - (spaceChar <* skipMany spaceChar) - notFollowedBy (char '$') - return " " - ) (try $ textStr cl) - notFollowedBy digit -- to prevent capture of $5 - return $ trimMath $ T.concat words' - where - inBalancedBraces :: (Stream s m Char, UpdateSourcePos s Char) => Int -> Text -> ParserT s st m Text - inBalancedBraces n = fmap T.pack . inBalancedBraces' n . T.unpack - - inBalancedBraces' :: (Stream s m Char, UpdateSourcePos s Char) => Int -> String -> ParserT s st m String - inBalancedBraces' 0 "" = do - c <- anyChar - if c == '{' - then inBalancedBraces' 1 "{" - else mzero - inBalancedBraces' 0 s = return $ reverse s - inBalancedBraces' numOpen ('\\':xs) = do - c <- anyChar - inBalancedBraces' numOpen (c:'\\':xs) - inBalancedBraces' numOpen xs = do - c <- anyChar - case c of - '}' -> inBalancedBraces' (numOpen - 1) (c:xs) - '{' -> inBalancedBraces' (numOpen + 1) (c:xs) - _ -> inBalancedBraces' numOpen (c:xs) - -mathDisplayWith :: (Stream s m Char, UpdateSourcePos s Char) => Text -> Text -> ParserT s st m Text -mathDisplayWith op cl = try $ fmap T.pack $ do - textStr op - many1Till (satisfy (/= '\n') <|> (newline <* notFollowedBy' blankline)) - (try $ textStr cl) - -mathDisplay :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Text -mathDisplay = - (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") - <|> (guardEnabled Ext_tex_math_single_backslash >> - mathDisplayWith "\\[" "\\]") - <|> (guardEnabled Ext_tex_math_double_backslash >> - mathDisplayWith "\\\\[" "\\\\]") - -mathInline :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Text -mathInline = - (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") - <|> (guardEnabled Ext_tex_math_single_backslash >> - mathInlineWith "\\(" "\\)") - <|> (guardEnabled Ext_tex_math_double_backslash >> - mathInlineWith "\\\\(" "\\\\)") - --- | Applies a parser, returns tuple of its results and its horizontal --- displacement (the difference between the source column at the end --- and the source column at the beginning). Vertical displacement --- (source row) is ignored. -withHorizDisplacement :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m a -- ^ Parser to apply - -> ParserT s st m (a, Int) -- ^ (result, displacement) -withHorizDisplacement parser = do - pos1 <- getPosition - result <- parser - pos2 <- getPosition - return (result, sourceColumn pos2 - sourceColumn pos1) - --- | Applies a parser and returns the raw string that was parsed, --- along with the value produced by the parser. -withRaw :: Monad m - => ParsecT Sources st m a - -> ParsecT Sources st m (a, Text) -withRaw parser = do - inps1 <- getInput - result <- parser - inps2 <- getInput - -- 'raw' is the difference between inps1 and inps2 - return (result, sourcesDifference inps1 inps2) - -sourcesDifference :: Sources -> Sources -> Text -sourcesDifference (Sources is1) (Sources is2) = go is1 is2 - where - go inps1 inps2 = - case (inps1, inps2) of - ([], _) -> mempty - (_, []) -> mconcat $ map snd inps1 - ((p1,t1):rest1, (p2, t2):rest2) - | p1 == p2 - , t1 == t2 -> go rest1 rest2 - | p1 == p2 - , t1 /= t2 -> fromMaybe mempty $ T.stripSuffix t2 t1 - | otherwise -> t1 <> go rest1 inps2 - --- | Parses backslash, then applies character parser. -escaped :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Char -- ^ Parser for character to escape - -> ParserT s st m Char -escaped parser = try $ char '\\' >> parser - --- | Parse character entity. -characterReference :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char -characterReference = try $ do - char '&' - ent <- many1Till nonspaceChar (char ';') - let ent' = case ent of - '#':'X':xs -> '#':'x':xs -- workaround tagsoup bug - '#':_ -> ent - _ -> ent ++ ";" - case lookupEntity ent' of - Just (c : _) -> return c - _ -> Prelude.fail "entity not found" - --- | Parses an uppercase roman numeral and returns (UpperRoman, number). -upperRoman :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -upperRoman = do - num <- romanNumeral True - return (UpperRoman, num) - --- | Parses a lowercase roman numeral and returns (LowerRoman, number). -lowerRoman :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -lowerRoman = do - num <- romanNumeral False - return (LowerRoman, num) - --- | Parses a decimal numeral and returns (Decimal, number). -decimal :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -decimal = do - num <- many1 digit - return (Decimal, fromMaybe 1 $ safeRead $ T.pack num) - --- | Parses a '@' and optional label and --- returns (DefaultStyle, [next example number]). The next --- example number is incremented in parser state, and the label --- (if present) is added to the label table. -exampleNum :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s ParserState m (ListNumberStyle, Int) -exampleNum = do - char '@' - lab <- mconcat . map T.pack <$> - many (many1 alphaNum <|> - try (do c <- char '_' <|> char '-' - cs <- many1 alphaNum - return (c:cs))) - st <- getState - let num = stateNextExample st - let newlabels = if T.null lab - then stateExamples st - else M.insert lab num $ stateExamples st - updateState $ \s -> s{ stateNextExample = num + 1 - , stateExamples = newlabels } - return (Example, num) - --- | Parses a '#' returns (DefaultStyle, 1). -defaultNum :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -defaultNum = do - char '#' - return (DefaultStyle, 1) - --- | Parses a lowercase letter and returns (LowerAlpha, number). -lowerAlpha :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -lowerAlpha = do - ch <- satisfy isAsciiLower - return (LowerAlpha, ord ch - ord 'a' + 1) - --- | Parses an uppercase letter and returns (UpperAlpha, number). -upperAlpha :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -upperAlpha = do - ch <- satisfy isAsciiUpper - return (UpperAlpha, ord ch - ord 'A' + 1) - --- | Parses a roman numeral i or I -romanOne :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -romanOne = (char 'i' >> return (LowerRoman, 1)) <|> - (char 'I' >> return (UpperRoman, 1)) - --- | Parses an ordered list marker and returns list attributes. -anyOrderedListMarker :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s ParserState m ListAttributes -anyOrderedListMarker = choice - [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], - numParser <- [decimal, exampleNum, defaultNum, romanOne, - lowerAlpha, lowerRoman, upperAlpha, upperRoman]] - --- | Parses a list number (num) followed by a period, returns list attributes. -inPeriod :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m (ListNumberStyle, Int) - -> ParserT s st m ListAttributes -inPeriod num = try $ do - (style, start) <- num - char '.' - let delim = if style == DefaultStyle - then DefaultDelim - else Period - return (start, style, delim) - --- | Parses a list number (num) followed by a paren, returns list attributes. -inOneParen :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m (ListNumberStyle, Int) - -> ParserT s st m ListAttributes -inOneParen num = try $ do - (style, start) <- num - char ')' - return (start, style, OneParen) - --- | Parses a list number (num) enclosed in parens, returns list attributes. -inTwoParens :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m (ListNumberStyle, Int) - -> ParserT s st m ListAttributes -inTwoParens num = try $ do - char '(' - (style, start) <- num - char ')' - return (start, style, TwoParens) - --- | Parses an ordered list marker with a given style and delimiter, --- returns number. -orderedListMarker :: (Stream s m Char, UpdateSourcePos s Char) - => ListNumberStyle - -> ListNumberDelim - -> ParserT s ParserState m Int -orderedListMarker style delim = do - let num = defaultNum <|> -- # can continue any kind of list - case style of - DefaultStyle -> decimal - Example -> exampleNum - Decimal -> decimal - UpperRoman -> upperRoman - LowerRoman -> lowerRoman - UpperAlpha -> upperAlpha - LowerAlpha -> lowerAlpha - let context = case delim of - DefaultDelim -> inPeriod - Period -> inPeriod - OneParen -> inOneParen - TwoParens -> inTwoParens - (start, _, _) <- context num - return start - --- | Parses a character reference and returns a Str element. -charRef :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inline -charRef = Str . T.singleton <$> characterReference - -lineBlockLine :: Monad m => ParserT Sources st m Text -lineBlockLine = try $ do - char '|' - char ' ' - white <- T.pack <$> many (spaceChar >> return '\160') - notFollowedBy newline - line <- anyLine - continuations <- many (try $ char ' ' >> anyLine) - return $ white <> T.unwords (line : continuations) - -blankLineBlockLine :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char -blankLineBlockLine = try (char '|' >> blankline) - --- | Parses an RST-style line block and returns a list of strings. -lineBlockLines :: Monad m => ParserT Sources st m [Text] -lineBlockLines = try $ do - lines' <- many1 (lineBlockLine <|> (T.singleton <$> blankLineBlockLine)) - skipMany blankline - return lines' - --- | Parse a table using 'headerParser', 'rowParser', --- 'lineParser', and 'footerParser'. -tableWith :: (Stream s m Char, UpdateSourcePos s Char, - HasReaderOptions st, Monad mf) - => ParserT s st m (mf [Blocks], [Alignment], [Int]) - -> ([Int] -> ParserT s st m (mf [Blocks])) - -> ParserT s st m sep - -> ParserT s st m end - -> ParserT s st m (mf Blocks) -tableWith headerParser rowParser lineParser footerParser = try $ do - (aligns, widths, heads, rows) <- tableWith' headerParser rowParser - lineParser footerParser - let th = TableHead nullAttr <$> heads - tb = (:[]) . TableBody nullAttr 0 [] <$> rows - tf = pure $ TableFoot nullAttr [] - return $ B.table B.emptyCaption (zip aligns (map fromWidth widths)) <$> th <*> tb <*> tf - where - fromWidth n - | n > 0 = ColWidth n - | otherwise = ColWidthDefault - -type TableComponents mf = ([Alignment], [Double], mf [Row], mf [Row]) - -tableWith' :: (Stream s m Char, UpdateSourcePos s Char, - HasReaderOptions st, Monad mf) - => ParserT s st m (mf [Blocks], [Alignment], [Int]) - -> ([Int] -> ParserT s st m (mf [Blocks])) - -> ParserT s st m sep - -> ParserT s st m end - -> ParserT s st m (TableComponents mf) -tableWith' headerParser rowParser lineParser footerParser = try $ do - (heads, aligns, indices) <- headerParser - lines' <- sequence <$> rowParser indices `sepEndBy1` lineParser - footerParser - numColumns <- getOption readerColumns - let widths = if null indices - then replicate (length aligns) 0.0 - else widthsFromIndices numColumns indices - let toRow = Row nullAttr . map B.simpleCell - toHeaderRow l = [toRow l | not (null l)] - return (aligns, widths, toHeaderRow <$> heads, map toRow <$> lines') - --- Calculate relative widths of table columns, based on indices -widthsFromIndices :: Int -- Number of columns on terminal - -> [Int] -- Indices - -> [Double] -- Fractional relative sizes of columns -widthsFromIndices _ [] = [] -widthsFromIndices numColumns' indices = - let numColumns = max numColumns' (if null indices then 0 else last indices) - lengths' = zipWith (-) indices (0:indices) - lengths = reverse $ - case reverse lengths' of - [] -> [] - [x] -> [x] - -- compensate for the fact that intercolumn - -- spaces are counted in widths of all columns - -- but the last... - (x:y:zs) -> if x < y && y - x <= 2 - then y:y:zs - else x:y:zs - totLength = sum lengths - quotient = if totLength > numColumns - then fromIntegral totLength - else fromIntegral numColumns - fracs = map (\l -> fromIntegral l / quotient) lengths in - tail fracs - ---- - --- Parse a grid table: starts with row of '-' on top, then header --- (which may be grid), then the rows, --- which may be grid, separated by blank lines, and --- ending with a footer (dashed line followed by blank line). -gridTableWith :: (Monad m, HasReaderOptions st, HasLastStrPosition st, Monad mf) - => ParserT Sources st m (mf Blocks) -- ^ Block list parser - -> Bool -- ^ Headerless table - -> ParserT Sources st m (mf Blocks) -gridTableWith blocks headless = - tableWith (gridTableHeader headless blocks) (gridTableRow blocks) - (gridTableSep '-') gridTableFooter - -gridTableWith' :: (Monad m, HasReaderOptions st, HasLastStrPosition st, - Monad mf) - => ParserT Sources st m (mf Blocks) -- ^ Block list parser - -> Bool -- ^ Headerless table - -> ParserT Sources st m (TableComponents mf) -gridTableWith' blocks headless = - tableWith' (gridTableHeader headless blocks) (gridTableRow blocks) - (gridTableSep '-') gridTableFooter - -gridTableSplitLine :: [Int] -> Text -> [Text] -gridTableSplitLine indices line = map removeFinalBar $ tail $ - splitTextByIndices (init indices) $ trimr line - -gridPart :: Monad m => Char -> ParserT Sources st m ((Int, Int), Alignment) -gridPart ch = do - leftColon <- option False (True <$ char ':') - dashes <- many1 (char ch) - rightColon <- option False (True <$ char ':') - char '+' - let lengthDashes = length dashes + (if leftColon then 1 else 0) + - (if rightColon then 1 else 0) - let alignment = case (leftColon, rightColon) of - (True, True) -> AlignCenter - (True, False) -> AlignLeft - (False, True) -> AlignRight - (False, False) -> AlignDefault - return ((lengthDashes, lengthDashes + 1), alignment) - -gridDashedLines :: Monad m => Char -> ParserT Sources st m [((Int, Int), Alignment)] -gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline - -removeFinalBar :: Text -> Text -removeFinalBar = T.dropWhileEnd go . T.dropWhileEnd (=='|') - where - go c = T.any (== c) " \t" - --- | Separator between rows of grid table. -gridTableSep :: Monad m => Char -> ParserT Sources st m Char -gridTableSep ch = try $ gridDashedLines ch >> return '\n' - --- | Parse header for a grid table. -gridTableHeader :: (Monad m, Monad mf, HasLastStrPosition st) - => Bool -- ^ Headerless table - -> ParserT Sources st m (mf Blocks) - -> ParserT Sources st m (mf [Blocks], [Alignment], [Int]) -gridTableHeader True _ = do - optional blanklines - dashes <- gridDashedLines '-' - let aligns = map snd dashes - let lines' = map (snd . fst) dashes - let indices = scanl (+) 0 lines' - return (return [], aligns, indices) -gridTableHeader False blocks = try $ do - optional blanklines - dashes <- gridDashedLines '-' - rawContent <- many1 (notFollowedBy (gridTableSep '=') >> char '|' >> - T.pack <$> many1Till anyChar newline) - underDashes <- gridDashedLines '=' - guard $ length dashes == length underDashes - let lines' = map (snd . fst) underDashes - let indices = scanl (+) 0 lines' - let aligns = map snd underDashes - let rawHeads = map (T.unlines . map trim) $ transpose - $ map (gridTableSplitLine indices) rawContent - heads <- sequence <$> mapM (parseFromString' blocks . trim) rawHeads - return (heads, aligns, indices) - -gridTableRawLine :: (Stream s m Char, UpdateSourcePos s Char) => [Int] -> ParserT s st m [Text] -gridTableRawLine indices = do - char '|' - line <- many1Till anyChar newline - return (gridTableSplitLine indices $ T.pack line) - --- | Parse row of grid table. -gridTableRow :: (Monad m, Monad mf, HasLastStrPosition st) - => ParserT Sources st m (mf Blocks) - -> [Int] - -> ParserT Sources st m (mf [Blocks]) -gridTableRow blocks indices = do - colLines <- many1 (gridTableRawLine indices) - let cols = map ((<> "\n") . T.unlines . removeOneLeadingSpace) $ - transpose colLines - compactifyCell bs = case compactify [bs] of - [] -> mempty - x:_ -> x - cells <- sequence <$> mapM (parseFromString' blocks) cols - return $ fmap (map compactifyCell) cells - -removeOneLeadingSpace :: [Text] -> [Text] -removeOneLeadingSpace xs = - if all startsWithSpace xs - then map (T.drop 1) xs - else xs - where startsWithSpace t = case T.uncons t of - Nothing -> True - Just (c, _) -> c == ' ' - --- | Parse footer for a grid table. -gridTableFooter :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () -gridTableFooter = optional blanklines - ---- - --- | Removes the ParsecT layer from the monad transformer stack -readWithM :: (Monad m, ToSources t) - => ParserT Sources st m a -- ^ parser - -> st -- ^ initial state - -> t -- ^ input - -> m (Either PandocError a) -readWithM parser state input = - mapLeft (PandocParsecError sources) - <$> runParserT parser state (initialSourceName sources) sources - where - sources = toSources input - --- | Parse a string with a given parser and state -readWith :: ToSources t - => Parser Sources st a - -> st - -> t - -> Either PandocError a -readWith p t inp = runIdentity $ readWithM p t inp - --- | Parse a string with @parser@ (for testing). -testStringWith :: Show a - => ParserT Sources ParserState Identity a - -> Text - -> IO () -testStringWith parser str = UTF8.putStrLn $ tshow $ - readWith parser defaultParserState (toSources str) - --- | Parsing options. -data ParserState = ParserState - { stateOptions :: ReaderOptions, -- ^ User options - stateParserContext :: ParserContext, -- ^ Inside list? - stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? - stateAllowLinks :: Bool, -- ^ Allow parsing of links - stateAllowLineBreaks :: Bool, -- ^ Allow parsing of line breaks - stateMaxNestingLevel :: Int, -- ^ Max # of nested Strong/Emph - stateLastStrPos :: Maybe SourcePos, -- ^ Position after last str parsed - stateKeys :: KeyTable, -- ^ List of reference keys - stateHeaderKeys :: KeyTable, -- ^ List of implicit header ref keys - stateSubstitutions :: SubstTable, -- ^ List of substitution references - stateNotes :: NoteTable, -- ^ List of notes (raw bodies) - stateNotes' :: NoteTable', -- ^ List of notes (parsed bodies) - stateNoteRefs :: Set.Set Text, -- ^ List of note references used - stateInNote :: Bool, -- ^ True if parsing note contents - stateNoteNumber :: Int, -- ^ Last note number for citations - stateMeta :: Meta, -- ^ Document metadata - stateMeta' :: Future ParserState Meta, -- ^ Document metadata - stateCitations :: M.Map Text Text, -- ^ RST-style citations - stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used - stateIdentifiers :: Set.Set Text, -- ^ Header identifiers used - stateNextExample :: Int, -- ^ Number of next example - stateExamples :: M.Map Text Int, -- ^ Map from example labels to numbers - stateMacros :: M.Map Text Macro, -- ^ Table of macros defined so far - stateRstDefaultRole :: Text, -- ^ Current rST default interpreted text role - stateRstHighlight :: Maybe Text, -- ^ Current rST literal block language - stateRstCustomRoles :: M.Map Text (Text, Maybe Text, Attr), -- ^ Current rST custom text roles - -- Triple represents: 1) Base role, 2) Optional format (only for :raw: - -- roles), 3) Additional classes (rest of Attr is unused)). - stateCaption :: Maybe Inlines, -- ^ Caption in current environment - stateInHtmlBlock :: Maybe Text, -- ^ Tag type of HTML block being parsed - stateFencedDivLevel :: Int, -- ^ Depth of fenced div - stateContainers :: [Text], -- ^ parent include files - stateLogMessages :: [LogMessage], -- ^ log messages - stateMarkdownAttribute :: Bool -- ^ True if in markdown=1 context - } - -instance Default ParserState where - def = defaultParserState - -instance HasMeta ParserState where - setMeta field val st = - st{ stateMeta = setMeta field val $ stateMeta st } - deleteMeta field st = - st{ stateMeta = deleteMeta field $ stateMeta st } - -class HasReaderOptions st where - extractReaderOptions :: st -> ReaderOptions - getOption :: (Stream s m t) => (ReaderOptions -> b) -> ParserT s st m b - -- default - getOption f = f . extractReaderOptions <$> getState - -instance HasReaderOptions ParserState where - extractReaderOptions = stateOptions - -class HasQuoteContext st m where - getQuoteContext :: (Stream s m t) => ParsecT s st m QuoteContext - withQuoteContext :: QuoteContext -> ParsecT s st m a -> ParsecT s st m a - -instance Monad m => HasQuoteContext ParserState m where - getQuoteContext = stateQuoteContext <$> getState - withQuoteContext context parser = do - oldState <- getState - let oldQuoteContext = stateQuoteContext oldState - setState oldState { stateQuoteContext = context } - result <- parser - newState <- getState - setState newState { stateQuoteContext = oldQuoteContext } - return result - -class HasIdentifierList st where - extractIdentifierList :: st -> Set.Set Text - updateIdentifierList :: (Set.Set Text -> Set.Set Text) -> st -> st - -instance HasIdentifierList ParserState where - extractIdentifierList = stateIdentifiers - updateIdentifierList f st = st{ stateIdentifiers = f $ stateIdentifiers st } - -class HasMacros st where - extractMacros :: st -> M.Map Text Macro - updateMacros :: (M.Map Text Macro -> M.Map Text Macro) -> st -> st - -instance HasMacros ParserState where - extractMacros = stateMacros - updateMacros f st = st{ stateMacros = f $ stateMacros st } - -class HasLastStrPosition st where - setLastStrPos :: Maybe SourcePos -> st -> st - getLastStrPos :: st -> Maybe SourcePos - -instance HasLastStrPosition ParserState where - setLastStrPos pos st = st{ stateLastStrPos = pos } - getLastStrPos st = stateLastStrPos st - -class HasLogMessages st where - addLogMessage :: LogMessage -> st -> st - getLogMessages :: st -> [LogMessage] - -instance HasLogMessages ParserState where - addLogMessage msg st = st{ stateLogMessages = msg : stateLogMessages st } - getLogMessages st = reverse $ stateLogMessages st - -class HasIncludeFiles st where - getIncludeFiles :: st -> [Text] - addIncludeFile :: Text -> st -> st - dropLatestIncludeFile :: st -> st - -instance HasIncludeFiles ParserState where - getIncludeFiles = stateContainers - addIncludeFile f s = s{ stateContainers = f : stateContainers s } - dropLatestIncludeFile s = s { stateContainers = drop 1 $ stateContainers s } - -defaultParserState :: ParserState -defaultParserState = - ParserState { stateOptions = def, - stateParserContext = NullState, - stateQuoteContext = NoQuote, - stateAllowLinks = True, - stateAllowLineBreaks = True, - stateMaxNestingLevel = 6, - stateLastStrPos = Nothing, - stateKeys = M.empty, - stateHeaderKeys = M.empty, - stateSubstitutions = M.empty, - stateNotes = [], - stateNotes' = M.empty, - stateNoteRefs = Set.empty, - stateInNote = False, - stateNoteNumber = 0, - stateMeta = nullMeta, - stateMeta' = return nullMeta, - stateCitations = M.empty, - stateHeaderTable = [], - stateIdentifiers = Set.empty, - stateNextExample = 1, - stateExamples = M.empty, - stateMacros = M.empty, - stateRstDefaultRole = "title-reference", - stateRstHighlight = Nothing, - stateRstCustomRoles = M.empty, - stateCaption = Nothing, - stateInHtmlBlock = Nothing, - stateFencedDivLevel = 0, - stateContainers = [], - stateLogMessages = [], - stateMarkdownAttribute = False - } - --- | Add a log message. -logMessage :: (Stream s m a, HasLogMessages st) - => LogMessage -> ParserT s st m () -logMessage msg = updateState (addLogMessage msg) - --- | Report all the accumulated log messages, according to verbosity level. -reportLogMessages :: (PandocMonad m, HasLogMessages st) => ParserT s st m () -reportLogMessages = do - msgs <- getLogMessages <$> getState - mapM_ report msgs - --- | Succeed only if the extension is enabled. -guardEnabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () -guardEnabled ext = getOption readerExtensions >>= guard . extensionEnabled ext - --- | Succeed only if the extension is disabled. -guardDisabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () -guardDisabled ext = getOption readerExtensions >>= guard . not . extensionEnabled ext - --- | Update the position on which the last string ended. -updateLastStrPos :: (Stream s m a, HasLastStrPosition st) => ParserT s st m () -updateLastStrPos = getPosition >>= updateState . setLastStrPos . Just - --- | Whether we are right after the end of a string. -notAfterString :: (Stream s m a, HasLastStrPosition st) => ParserT s st m Bool -notAfterString = do - pos <- getPosition - st <- getState - return $ getLastStrPos st /= Just pos - -data HeaderType - = SingleHeader Char -- ^ Single line of characters underneath - | DoubleHeader Char -- ^ Lines of characters above and below - deriving (Eq, Show) - -data ParserContext - = ListItemState -- ^ Used when running parser on list item contents - | NullState -- ^ Default state - deriving (Eq, Show) - -data QuoteContext - = InSingleQuote -- ^ Used when parsing inside single quotes - | InDoubleQuote -- ^ Used when parsing inside double quotes - | NoQuote -- ^ Used when not parsing inside quotes - deriving (Eq, Show) - -type NoteTable = [(Text, Text)] - -type NoteTable' = M.Map Text (SourcePos, Future ParserState Blocks) --- used in markdown reader - -newtype Key = Key Text deriving (Show, Read, Eq, Ord) - -toKey :: Text -> Key -toKey = Key . T.toLower . T.unwords . T.words . unbracket - where unbracket t - | Just ('[', t') <- T.uncons t - , Just (t'', ']') <- T.unsnoc t' - = t'' - | otherwise - = t - -type KeyTable = M.Map Key (Target, Attr) - -type SubstTable = M.Map Key Inlines - --- | Add header to the list of headers in state, together --- with its associated identifier. If the identifier is null --- and the auto_identifiers extension is set, generate a new --- unique identifier, and update the list of identifiers --- in state. Issue a warning if an explicit identifier --- is encountered that duplicates an earlier identifier --- (explicit or automatically generated). -registerHeader :: (Stream s m a, HasReaderOptions st, - HasLogMessages st, HasIdentifierList st) - => Attr -> Inlines -> ParserT s st m Attr -registerHeader (ident,classes,kvs) header' = do - ids <- extractIdentifierList <$> getState - exts <- getOption readerExtensions - if T.null ident && Ext_auto_identifiers `extensionEnabled` exts - then do - let id' = uniqueIdent exts (B.toList header') ids - let id'' = if Ext_ascii_identifiers `extensionEnabled` exts - then toAsciiText id' - else id' - updateState $ updateIdentifierList $ Set.insert id' - updateState $ updateIdentifierList $ Set.insert id'' - return (id'',classes,kvs) - else do - unless (T.null ident) $ do - when (ident `Set.member` ids) $ do - pos <- getPosition - logMessage $ DuplicateIdentifier ident pos - updateState $ updateIdentifierList $ Set.insert ident - return (ident,classes,kvs) - -smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, - HasQuoteContext st m, - Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Inlines - -> ParserT s st m Inlines -smartPunctuation inlineParser = do - guardEnabled Ext_smart - choice [ quoted inlineParser, apostrophe, doubleCloseQuote, dash, ellipses ] - -quoted :: (HasLastStrPosition st, HasQuoteContext st m, - Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Inlines - -> ParserT s st m Inlines -quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser - -singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m, - Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Inlines - -> ParserT s st m Inlines -singleQuoted inlineParser = do - singleQuoteStart - (B.singleQuoted . mconcat <$> - try - (withQuoteContext InSingleQuote (many1Till inlineParser singleQuoteEnd))) - <|> pure "\8217" - -doubleQuoted :: (HasQuoteContext st m, HasLastStrPosition st, - Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Inlines - -> ParserT s st m Inlines -doubleQuoted inlineParser = do - doubleQuoteStart - (B.doubleQuoted . mconcat <$> - try - (withQuoteContext InDoubleQuote (manyTill inlineParser doubleQuoteEnd))) - <|> pure (B.str "\8220") - -failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t) - => QuoteContext - -> ParserT s st m () -failIfInQuoteContext context = do - context' <- getQuoteContext - when (context' == context) $ Prelude.fail "already inside quotes" - -charOrRef :: (Stream s m Char, UpdateSourcePos s Char) => [Char] -> ParserT s st m Char -charOrRef cs = - oneOf cs <|> try (do c <- characterReference - guard (c `elem` cs) - return c) - -singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m, - Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m () -singleQuoteStart = do - failIfInQuoteContext InSingleQuote - -- single quote start can't be right after str - guard =<< notAfterString - try $ do - charOrRef "'\8216\145" - void $ lookAhead (satisfy (not . isSpaceChar)) - -singleQuoteEnd :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m () -singleQuoteEnd = try $ do - charOrRef "'\8217\146" - notFollowedBy alphaNum - -doubleQuoteStart :: (HasLastStrPosition st, - HasQuoteContext st m, - Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m () -doubleQuoteStart = do - failIfInQuoteContext InDoubleQuote - guard =<< notAfterString - try $ do charOrRef "\"\8220\147" - void $ lookAhead (satisfy (not . isSpaceChar)) - -doubleQuoteEnd :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m () -doubleQuoteEnd = void (charOrRef "\"\8221\148") - -apostrophe :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines -apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\8217") - -doubleCloseQuote :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines -doubleCloseQuote = B.str "\8221" <$ char '"' - -ellipses :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Inlines -ellipses = try (string "..." >> return (B.str "\8230")) - -dash :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Inlines -dash = try $ do - oldDashes <- extensionEnabled Ext_old_dashes <$> getOption readerExtensions - if oldDashes - then do - char '-' - (char '-' >> return (B.str "\8212")) - <|> (lookAhead digit >> return (B.str "\8211")) - else do - string "--" - (char '-' >> return (B.str "\8212")) - <|> return (B.str "\8211") - --- This is used to prevent exponential blowups for things like: --- a**a*a**a*a**a*a**a*a**a*a**a*a** -nested :: Stream s m a - => ParserT s ParserState m a - -> ParserT s ParserState m a -nested p = do - nestlevel <- stateMaxNestingLevel <$> getState - guard $ nestlevel > 0 - updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } - res <- p - updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } - return res - -citeKey :: (Stream s m Char, UpdateSourcePos s Char, HasLastStrPosition st) - => Bool -- ^ If True, allow expanded @{..} syntax. - -> ParserT s st m (Bool, Text) -citeKey allowBraced = try $ do - guard =<< notAfterString - suppress_author <- option False (True <$ char '-') - char '@' - key <- simpleCiteIdentifier - <|> if allowBraced - then charsInBalanced '{' '}' (satisfy (not . isSpace)) - else mzero - return (suppress_author, key) - -simpleCiteIdentifier :: (Stream s m Char, UpdateSourcePos s Char) - => ParserT s st m Text -simpleCiteIdentifier = do - firstChar <- alphaNum <|> char '_' <|> char '*' -- @* for wildcard in nocite - let regchar = satisfy (\c -> isAlphaNum c || c == '_') - let internal p = try $ p <* lookAhead regchar - rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") <|> - try (oneOf ":/" <* lookAhead (char '/')) - return $ T.pack $ firstChar:rest - -token :: (Stream s m t) - => (t -> Text) - -> (t -> SourcePos) - -> (t -> Maybe a) - -> ParsecT s st m a -token pp pos match = tokenPrim (T.unpack . pp) (\_ t _ -> pos t) match - -infixr 5 <+?> -(<+?>) :: (Monoid a) => ParserT s st m a -> ParserT s st m a -> ParserT s st m a -a <+?> b = a >>= flip fmap (try b <|> return mempty) . mappend - -extractIdClass :: Attr -> Attr -extractIdClass (ident, cls, kvs) = (ident', cls', kvs') - where - ident' = fromMaybe ident (lookup "id" kvs) - cls' = maybe cls T.words $ lookup "class" kvs - kvs' = filter (\(k,_) -> k /= "id" || k /= "class") kvs - -insertIncludedFile :: (PandocMonad m, HasIncludeFiles st) - => ParserT a st m b -- ^ parser to apply - -> (Text -> a) -- ^ convert Text to stream type - -> [FilePath] -- ^ search path (directories) - -> FilePath -- ^ path of file to include - -> Maybe Int -- ^ start line (negative counts from end) - -> Maybe Int -- ^ end line (negative counts from end) - -> ParserT a st m b -insertIncludedFile parser toStream dirs f mbstartline mbendline = do - oldPos <- getPosition - oldInput <- getInput - containers <- getIncludeFiles <$> getState - when (T.pack f `elem` containers) $ - throwError $ PandocParseError $ T.pack $ "Include file loop at " ++ show oldPos - updateState $ addIncludeFile $ T.pack f - mbcontents <- readFileFromDirs dirs f - contents <- case mbcontents of - Just s -> return $ exciseLines mbstartline mbendline s - Nothing -> do - report $ CouldNotLoadIncludeFile (T.pack f) oldPos - return "" - setInput $ toStream contents - setPosition $ newPos f (fromMaybe 1 mbstartline) 1 - result <- parser - setInput oldInput - setPosition oldPos - updateState dropLatestIncludeFile - return result - -exciseLines :: Maybe Int -> Maybe Int -> Text -> Text -exciseLines Nothing Nothing t = t -exciseLines mbstartline mbendline t = - T.unlines $ take (endline' - (startline' - 1)) - $ drop (startline' - 1) contentLines - where - contentLines = T.lines t - numLines = length contentLines - startline' = case mbstartline of - Nothing -> 1 - Just x | x >= 0 -> x - | otherwise -> numLines + x -- negative from end - endline' = case mbendline of - Nothing -> numLines - Just x | x >= 0 -> x - | otherwise -> numLines + x -- negative from end - +import Text.Pandoc.Parsing.Capabilities +import Text.Pandoc.Parsing.Citations +import Text.Pandoc.Parsing.Combinators +import Text.Pandoc.Parsing.GridTable +import Text.Pandoc.Parsing.Lists +import Text.Pandoc.Parsing.Math +import Text.Pandoc.Parsing.Smart +import Text.Pandoc.Parsing.State +import Text.Pandoc.Parsing.Types diff --git a/src/Text/Pandoc/Parsing/Capabilities.hs b/src/Text/Pandoc/Parsing/Capabilities.hs new file mode 100644 index 000000000..770b307a3 --- /dev/null +++ b/src/Text/Pandoc/Parsing/Capabilities.hs @@ -0,0 +1,137 @@ +{-# LANGUAGE MultiParamTypeClasses #-} +{- | + Module : Text.Pandoc.Parsing + Copyright : © 2006-2022 John MacFarlane + License : GPL-2.0-or-later + Maintainer : John MacFarlane <[email protected]> + +Parser state capabilities. +-} +module Text.Pandoc.Parsing.Capabilities + ( -- * Capabilities + + -- ** Element identifiers + HasIdentifierList (..) + + -- ** Include files + , HasIncludeFiles (..) + + -- ** String/Word boundaries + , HasLastStrPosition (..) + , updateLastStrPos + , notAfterString + + -- ** Logging + , HasLogMessages (..) + , logMessage + , reportLogMessages + + -- ** Macros + , HasMacros (..) + + -- ** Quote context + , QuoteContext (..) + , HasQuoteContext (..) + , failIfInQuoteContext + + -- ** Reader options + , HasReaderOptions (..) + , guardEnabled + , guardDisabled + ) +where + +import Control.Monad (guard, when) +import Data.Text (Text) +import Text.Parsec (ParsecT, SourcePos, Stream, getPosition, getState, updateState) +import Text.Pandoc.Class.PandocMonad (PandocMonad, report) +import Text.Pandoc.Logging (LogMessage) +import Text.Pandoc.Options + ( Extension + , ReaderOptions(readerExtensions) + , extensionEnabled + ) +import Text.Pandoc.Parsing.Types +import Text.Pandoc.Readers.LaTeX.Types (Macro) + +import qualified Data.Map as M +import qualified Data.Set as Set + +class HasReaderOptions st where + extractReaderOptions :: st -> ReaderOptions + getOption :: (Stream s m t) => (ReaderOptions -> b) -> ParserT s st m b + -- default + getOption f = f . extractReaderOptions <$> getState + +class HasQuoteContext st m where + getQuoteContext :: (Stream s m t) => ParsecT s st m QuoteContext + withQuoteContext :: QuoteContext -> ParsecT s st m a -> ParsecT s st m a + +failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t) + => QuoteContext + -> ParserT s st m () +failIfInQuoteContext context = do + context' <- getQuoteContext + when (context' == context) $ Prelude.fail "already inside quotes" + +class HasIdentifierList st where + extractIdentifierList :: st -> Set.Set Text + updateIdentifierList :: (Set.Set Text -> Set.Set Text) -> st -> st + +class HasMacros st where + extractMacros :: st -> M.Map Text Macro + updateMacros :: (M.Map Text Macro -> M.Map Text Macro) -> st -> st + +class HasLastStrPosition st where + setLastStrPos :: Maybe SourcePos -> st -> st + getLastStrPos :: st -> Maybe SourcePos + +class HasLogMessages st where + addLogMessage :: LogMessage -> st -> st + getLogMessages :: st -> [LogMessage] + +class HasIncludeFiles st where + getIncludeFiles :: st -> [Text] + addIncludeFile :: Text -> st -> st + dropLatestIncludeFile :: st -> st + +-- | Add a log message. +logMessage :: (Stream s m a, HasLogMessages st) + => LogMessage -> ParserT s st m () +logMessage msg = updateState (addLogMessage msg) + +-- | Report all the accumulated log messages, according to verbosity level. +reportLogMessages :: (PandocMonad m, HasLogMessages st) => ParserT s st m () +reportLogMessages = do + msgs <- getLogMessages <$> getState + mapM_ report msgs + +-- | Succeed only if the extension is enabled. +guardEnabled :: (Stream s m a, HasReaderOptions st) + => Extension -> ParserT s st m () +guardEnabled ext = + getOption readerExtensions >>= guard . extensionEnabled ext + +-- | Succeed only if the extension is disabled. +guardDisabled :: (Stream s m a, HasReaderOptions st) + => Extension -> ParserT s st m () +guardDisabled ext = + getOption readerExtensions >>= guard . not . extensionEnabled ext + +-- | Update the position on which the last string ended. +updateLastStrPos :: (Stream s m a, HasLastStrPosition st) + => ParserT s st m () +updateLastStrPos = getPosition >>= updateState . setLastStrPos . Just + +-- | Whether we are right after the end of a string. +notAfterString :: (Stream s m a, HasLastStrPosition st) => ParserT s st m Bool +notAfterString = do + pos <- getPosition + st <- getState + return $ getLastStrPos st /= Just pos + +data QuoteContext + = InSingleQuote -- ^ Used when parsing inside single quotes + | InDoubleQuote -- ^ Used when parsing inside double quotes + | NoQuote -- ^ Used when not parsing inside quotes + deriving (Eq, Show) diff --git a/src/Text/Pandoc/Parsing/Citations.hs b/src/Text/Pandoc/Parsing/Citations.hs new file mode 100644 index 000000000..54cb2ee60 --- /dev/null +++ b/src/Text/Pandoc/Parsing/Citations.hs @@ -0,0 +1,57 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | +Module : Text.Pandoc.Parsing.Citations +Copyright : © 2006-2022 John MacFarlane +License : GPL-2.0-or-later +Maintainer : John MacFarlane <[email protected]> + +Citation parsing. +-} + +module Text.Pandoc.Parsing.Citations + ( citeKey + ) +where + +import Control.Monad (guard, MonadPlus(mzero)) +import Data.Char (isAlphaNum , isSpace) +import Data.Text (Text) +import Text.Pandoc.Sources +import Text.Parsec + ( (<|>) + , Stream(..) + , lookAhead + , many + , option + , try + ) +import Text.Pandoc.Parsing.Capabilities (HasLastStrPosition, notAfterString) +import Text.Pandoc.Parsing.Combinators +import Text.Pandoc.Parsing.Types (ParserT) + +import qualified Data.Text as T + +citeKey :: (Stream s m Char, UpdateSourcePos s Char, HasLastStrPosition st) + => Bool -- ^ If True, allow expanded @{..} syntax. + -> ParserT s st m (Bool, Text) +citeKey allowBraced = try $ do + guard =<< notAfterString + suppress_author <- option False (True <$ char '-') + char '@' + key <- simpleCiteIdentifier + <|> if allowBraced + then charsInBalanced '{' '}' (satisfy (not . isSpace)) + else mzero + return (suppress_author, key) + +simpleCiteIdentifier :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Text +simpleCiteIdentifier = do + firstChar <- alphaNum <|> char '_' <|> char '*' -- @* for wildcard in nocite + let regchar = satisfy (\c -> isAlphaNum c || c == '_') + let internal p = try $ p <* lookAhead regchar + rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") <|> + try (oneOf ":/" <* lookAhead (char '/')) + return $ T.pack $ firstChar:rest + diff --git a/src/Text/Pandoc/Parsing/Combinators.hs b/src/Text/Pandoc/Parsing/Combinators.hs new file mode 100644 index 000000000..cb18662e0 --- /dev/null +++ b/src/Text/Pandoc/Parsing/Combinators.hs @@ -0,0 +1,751 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | +Module : Text.Pandoc.Parsing.Combinators +Copyright : © 2006-2022 John MacFarlane +License : GPL-2.0-or-later +Maintainer : John MacFarlane <[email protected]> + +Parser combinators for pandoc format readers. +-} + +module Text.Pandoc.Parsing.Combinators + ( (<+?>) + , anyLine + , anyLineNewline + , blankline + , blanklines + , charRef + , characterReference + , charsInBalanced + , countChar + , emailAddress + , enclosed + , escaped + , extractIdClass + , gobbleAtMostSpaces + , gobbleSpaces + , indentWith + , insertIncludedFile + , isSpaceChar -- not re-exported from T.P.Parsing + , lineBlockLines + , lineClump + , many1Char + , many1Till + , many1TillChar + , manyChar + , manyTillChar + , manyUntil + , manyUntilChar + , nested + , nonspaceChar + , notFollowedBy' + , oneOfStrings + , oneOfStringsCI + , parseFromString + , parseFromString' + , readWith + , readWithM + , registerHeader + , sepBy1' + , skipSpaces + , spaceChar + , stringAnyCase + , testStringWith + , textStr + , token + , trimInlinesF + , uri + , withHorizDisplacement + , withRaw + ) +where + +import Control.Monad + ( guard + , join + , liftM + , unless + , void + , when + ) +import Control.Monad.Except ( MonadError(throwError) ) +import Control.Monad.Identity ( Identity(..), MonadPlus(mzero) ) +import Data.Char + ( chr + , isAlphaNum + , isAscii + , isAsciiUpper + , isSpace + , ord + , toLower + , toUpper + ) +import Data.Functor (($>)) +import Data.List (intercalate) +import Data.Maybe (fromMaybe) +import Data.Text (Text) +import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Asciify (toAsciiText) +import Text.Pandoc.Builder (Attr, Inline(Str), Inlines, trimInlines) +import Text.Pandoc.Class.PandocMonad (PandocMonad, readFileFromDirs, report) +import Text.Pandoc.Logging + ( LogMessage(CouldNotLoadIncludeFile, DuplicateIdentifier) ) +import Text.Pandoc.Options + ( extensionEnabled + , Extension(Ext_auto_identifiers, Ext_ascii_identifiers) + , ReaderOptions(readerTabStop, readerExtensions) ) +import Text.Pandoc.Shared (escapeURI, mapLeft, schemes, tshow, uniqueIdent) +import Text.Pandoc.Sources +import Text.Pandoc.XML (fromEntities) +import Text.Parsec + ( (<|>) + , ParsecT + , SourcePos + , Stream(..) + , between + , choice + , count + , getInput + , getPosition + , getState + , lookAhead + , many + , many1 + , manyTill + , notFollowedBy + , option + , runParserT + , setInput + , setPosition + , skipMany + , sourceColumn + , sourceName + , tokenPrim + , try + , unexpected + , updateState + ) +import Text.Parsec.Pos (initialPos, newPos) +import Text.Pandoc.Error + ( PandocError(PandocParseError, PandocParsecError) ) +import Text.Pandoc.Parsing.Capabilities +import Text.Pandoc.Parsing.State +import Text.Pandoc.Parsing.Types ( Parser, ParserT, Future (..)) + +import qualified Data.Set as Set +import qualified Data.Text as T +import qualified Text.Pandoc.Builder as B +import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) + +-- | Remove whitespace from start and end; just like @'trimInlines'@, +-- but lifted into the 'Future' type. +trimInlinesF :: Future s Inlines -> Future s Inlines +trimInlinesF = liftM trimInlines + +-- | Like @count@, but packs its result +countChar :: (Stream s m Char, UpdateSourcePos s Char, Monad m) + => Int + -> ParsecT s st m Char + -> ParsecT s st m Text +countChar n = fmap T.pack . count n + +-- | Like @string@, but uses @Text@. +textStr :: (Stream s m Char, UpdateSourcePos s Char) + => Text -> ParsecT s u m Text +textStr t = string (T.unpack t) $> t + + +-- | Parse any line of text, returning the contents without the +-- final newline. +anyLine :: Monad m => ParserT Sources st m Text +anyLine = do + -- This is much faster than: + -- manyTill anyChar newline + inp <- getInput + case inp of + Sources [] -> mzero + Sources ((fp,t):inps) -> + -- we assume that lines don't span different input files + case T.break (=='\n') t of + (this, rest) + | T.null rest + , not (null inps) -> + -- line may span different input files, so do it + -- character by character + T.pack <$> manyTill anyChar newline + | otherwise -> do -- either end of inputs or newline in rest + setInput $ Sources ((fp, rest):inps) + char '\n' -- needed so parsec knows we won't match empty string + -- and so source pos is updated + return this + +-- | Parse any line, include the final newline in the output +anyLineNewline :: Monad m => ParserT Sources st m Text +anyLineNewline = (<> "\n") <$> anyLine + +-- | Parse indent by specified number of spaces (or equiv. tabs) +indentWith :: (Stream s m Char, UpdateSourcePos s Char) + => HasReaderOptions st + => Int -> ParserT s st m Text +indentWith num = do + tabStop <- getOption readerTabStop + if num < tabStop + then countChar num (char ' ') + else choice [ try (countChar num (char ' ')) + , try (char '\t' >> indentWith (num - tabStop)) ] + +-- | Like @many@, but packs its result. +manyChar :: Stream s m t + => ParserT s st m Char + -> ParserT s st m Text +manyChar = fmap T.pack . many + +-- | Like @many1@, but packs its result. +many1Char :: Stream s m t + => ParserT s st m Char + -> ParserT s st m Text +many1Char = fmap T.pack . many1 + +-- | Like @manyTill@, but packs its result. +manyTillChar :: Stream s m t + => ParserT s st m Char + -> ParserT s st m a + -> ParserT s st m Text +manyTillChar p = fmap T.pack . manyTill p + +-- | Like @manyTill@, but reads at least one item. +many1Till :: (Show end, Stream s m t) + => ParserT s st m a + -> ParserT s st m end + -> ParserT s st m [a] +many1Till p end = do + notFollowedBy' end + first <- p + rest <- manyTill p end + return (first:rest) + +-- | Like @many1Till@, but packs its result +many1TillChar :: (Show end, Stream s m t) + => ParserT s st m Char + -> ParserT s st m end + -> ParserT s st m Text +many1TillChar p = fmap T.pack . many1Till p + +-- | Like @manyTill@, but also returns the result of end parser. +manyUntil :: ParserT s u m a + -> ParserT s u m b + -> ParserT s u m ([a], b) +manyUntil p end = scan + where scan = + (do e <- end + return ([], e) + ) <|> + (do x <- p + (xs, e) <- scan + return (x:xs, e)) + +-- | Like @manyUntil@, but also packs its result. +manyUntilChar :: ParserT s u m Char + -> ParserT s u m b + -> ParserT s u m (Text, b) +manyUntilChar p = fmap go . manyUntil p + where + go (x, y) = (T.pack x, y) + +-- | Like @sepBy1@ from Parsec, +-- but does not fail if it @sep@ succeeds and @p@ fails. +sepBy1' :: ParsecT s u m a + -> ParsecT s u m sep + -> ParsecT s u m [a] +sepBy1' p sep = (:) <$> p <*> many (try $ sep >> p) + +-- | A more general form of @notFollowedBy@. This one allows any +-- type of parser to be specified, and succeeds only if that parser fails. +-- It does not consume any input. +notFollowedBy' :: (Show b, Stream s m a) => ParserT s st m b -> ParserT s st m () +notFollowedBy' p = try $ join $ do a <- try p + return (unexpected (show a)) + <|> + return (return ()) +-- (This version due to Andrew Pimlott on the Haskell mailing list.) + +oneOfStrings' :: (Stream s m Char, UpdateSourcePos s Char) + => (Char -> Char -> Bool) -> [Text] -> ParserT s st m Text +oneOfStrings' f = fmap T.pack . oneOfStrings'' f . fmap T.unpack + +-- TODO: This should be re-implemented in a Text-aware way +oneOfStrings'' :: (Stream s m Char, UpdateSourcePos s Char) + => (Char -> Char -> Bool) -> [String] -> ParserT s st m String +oneOfStrings'' _ [] = Prelude.fail "no strings" +oneOfStrings'' matches strs = try $ do + c <- anyChar + let strs' = [xs | (x:xs) <- strs, x `matches` c] + case strs' of + [] -> Prelude.fail "not found" + _ -> (c:) <$> oneOfStrings'' matches strs' + <|> if "" `elem` strs' + then return [c] + else Prelude.fail "not found" + +-- | Parses one of a list of strings. If the list contains +-- two strings one of which is a prefix of the other, the longer +-- string will be matched if possible. +oneOfStrings :: (Stream s m Char, UpdateSourcePos s Char) + => [Text] -> ParserT s st m Text +oneOfStrings = oneOfStrings' (==) + +-- | Parses one of a list of strings (tried in order), case insensitive. + +-- TODO: This will not be accurate with general Unicode (neither +-- Text.toLower nor Text.toCaseFold can be implemented with a map) +oneOfStringsCI :: (Stream s m Char, UpdateSourcePos s Char) + => [Text] -> ParserT s st m Text +oneOfStringsCI = oneOfStrings' ciMatch + where ciMatch x y = toLower' x == toLower' y + -- this optimizes toLower by checking common ASCII case + -- first, before calling the expensive unicode-aware + -- function: + toLower' c | isAsciiUpper c = chr (ord c + 32) + | isAscii c = c + | otherwise = toLower c + +-- | Parses a space or tab. +spaceChar :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Char +spaceChar = satisfy $ \c -> c == ' ' || c == '\t' + +-- | Parses a nonspace, nonnewline character. +nonspaceChar :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Char +nonspaceChar = satisfy (not . isSpaceChar) + +isSpaceChar :: Char -> Bool +isSpaceChar ' ' = True +isSpaceChar '\t' = True +isSpaceChar '\n' = True +isSpaceChar '\r' = True +isSpaceChar _ = False + +-- | Skips zero or more spaces or tabs. +skipSpaces :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m () +skipSpaces = skipMany spaceChar + +-- | Skips zero or more spaces or tabs, then reads a newline. +blankline :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Char +blankline = try $ skipSpaces >> newline + +-- | Parses one or more blank lines and returns a string of newlines. +blanklines :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Text +blanklines = T.pack <$> many1 blankline + +-- | Gobble n spaces; if tabs are encountered, expand them +-- and gobble some or all of their spaces, leaving the rest. +gobbleSpaces :: (HasReaderOptions st, Monad m) + => Int -> ParserT Sources st m () +gobbleSpaces 0 = return () +gobbleSpaces n + | n < 0 = error "gobbleSpaces called with negative number" + | otherwise = try $ do + char ' ' <|> eatOneSpaceOfTab + gobbleSpaces (n - 1) + +eatOneSpaceOfTab :: (HasReaderOptions st, Monad m) => ParserT Sources st m Char +eatOneSpaceOfTab = do + lookAhead (char '\t') + pos <- getPosition + tabstop <- getOption readerTabStop + -- replace the tab on the input stream with spaces + let numSpaces = tabstop - ((sourceColumn pos - 1) `mod` tabstop) + inp <- getInput + setInput $ + case inp of + Sources [] -> error "eatOneSpaceOfTab - empty Sources list" + Sources ((fp,t):rest) -> + -- drop the tab and add spaces + Sources ((fp, T.replicate numSpaces " " <> T.drop 1 t):rest) + char ' ' + +-- | Gobble up to n spaces; if tabs are encountered, expand them +-- and gobble some or all of their spaces, leaving the rest. +gobbleAtMostSpaces :: (HasReaderOptions st, Monad m) + => Int -> ParserT Sources st m Int +gobbleAtMostSpaces 0 = return 0 +gobbleAtMostSpaces n + | n < 0 = error "gobbleAtMostSpaces called with negative number" + | otherwise = option 0 $ do + char ' ' <|> eatOneSpaceOfTab + (+ 1) <$> gobbleAtMostSpaces (n - 1) + +-- | Parses material enclosed between start and end parsers. +enclosed :: (Show end, Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m t -- ^ start parser + -> ParserT s st m end -- ^ end parser + -> ParserT s st m a -- ^ content parser (to be used repeatedly) + -> ParserT s st m [a] +enclosed start end parser = try $ + start >> notFollowedBy space >> many1Till parser end + +-- | Parse string, case insensitive. +stringAnyCase :: (Stream s m Char, UpdateSourcePos s Char) + => Text -> ParserT s st m Text +stringAnyCase = fmap T.pack . stringAnyCase' . T.unpack + +stringAnyCase' :: (Stream s m Char, UpdateSourcePos s Char) + => String -> ParserT s st m String +stringAnyCase' [] = string "" +stringAnyCase' (x:xs) = do + firstChar <- char (toUpper x) <|> char (toLower x) + rest <- stringAnyCase' xs + return (firstChar:rest) + +-- TODO rewrite by just adding to Sources stream? +-- | Parse contents of 'str' using 'parser' and return result. +parseFromString :: Monad m + => ParserT Sources st m r + -> Text + -> ParserT Sources st m r +parseFromString parser str = do + oldPos <- getPosition + oldInput <- getInput + setInput $ toSources str + setPosition $ initialPos $ sourceName oldPos <> "_chunk" + result <- parser + spaces + setInput oldInput + setPosition oldPos + return result + +-- | Like 'parseFromString' but specialized for 'ParserState'. +-- This resets 'stateLastStrPos', which is almost always what we want. +parseFromString' :: (Monad m, HasLastStrPosition u) + => ParserT Sources u m a + -> Text + -> ParserT Sources u m a +parseFromString' parser str = do + oldLastStrPos <- getLastStrPos <$> getState + updateState $ setLastStrPos Nothing + res <- parseFromString parser str + updateState $ setLastStrPos oldLastStrPos + return res + +-- | Parse raw line block up to and including blank lines. +lineClump :: Monad m => ParserT Sources st m Text +lineClump = blanklines + <|> (T.unlines <$> many1 (notFollowedBy blankline >> anyLine)) + +-- | Parse a string of characters between an open character +-- and a close character, including text between balanced +-- pairs of open and close, which must be different. For example, +-- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))" +-- and return "hello (there)". +charsInBalanced :: (Stream s m Char, UpdateSourcePos s Char) => Char -> Char -> ParserT s st m Char + -> ParserT s st m Text +charsInBalanced open close parser = try $ do + char open + let isDelim c = c == open || c == close + raw <- many $ T.pack <$> many1 (notFollowedBy (satisfy isDelim) >> parser) + <|> (do res <- charsInBalanced open close parser + return $ T.singleton open <> res <> T.singleton close) + char close + return $ T.concat raw + +-- old charsInBalanced would be: +-- charsInBalanced open close (noneOf "\n" <|> char '\n' >> notFollowedBy blankline) +-- old charsInBalanced' would be: +-- charsInBalanced open close anyChar + +-- Parsers for email addresses and URIs + +-- | Parses an email address; returns original and corresponding +-- escaped mailto: URI. +emailAddress :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (Text, Text) +emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) + where toResult mbox dom = let full = fromEntities $ T.pack $ mbox ++ '@':dom + in (full, escapeURI $ "mailto:" <> full) + mailbox = intercalate "." <$> (emailWord `sepBy1'` dot) + domain = intercalate "." <$> (subdomain `sepBy1'` dot) + dot = char '.' + subdomain = many1 $ alphaNum <|> innerPunct (=='-') + -- this excludes some valid email addresses, since an + -- email could contain e.g. '__', but gives better results + -- for our purposes, when combined with markdown parsing: + innerPunct f = try (satisfy f + <* notFollowedBy (satisfy (not . isAlphaNum))) + -- technically an email address could begin with a symbol, + -- but allowing this creates too many problems. + -- See e.g. https://github.com/jgm/pandoc/issues/2940 + emailWord = do x <- satisfy isAlphaNum + xs <- many (satisfy isEmailChar) + return (x:xs) + isEmailChar c = isAlphaNum c || isEmailPunct c + isEmailPunct c = T.any (== c) "!\"#$%&'*+-/=?^_{|}~;" + + +uriScheme :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Text +uriScheme = oneOfStringsCI (Set.toList schemes) + +-- | Parses a URI. Returns pair of original and URI-escaped version. +uri :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (Text, Text) +uri = try $ do + scheme <- uriScheme + char ':' + -- Avoid parsing e.g. "**Notes:**" as a raw URI: + notFollowedBy $ satisfy (\c -> c == '*' || c == '_' || c == ']') + -- We allow sentence punctuation except at the end, since + -- we don't want the trailing '.' in 'http://google.com.' We want to allow + -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) + -- as a URL, while NOT picking up the closing paren in + -- (http://wikipedia.org). So we include balanced parens in the URL. + str <- T.concat <$> many1 (uriChunkBetween '(' ')' + <|> uriChunkBetween '{' '}' + <|> uriChunkBetween '[' ']' + <|> T.pack <$> uriChunk) + str' <- option str $ char '/' >> return (str <> "/") + let uri' = scheme <> ":" <> fromEntities str' + return (uri', escapeURI uri') + where + isWordChar '#' = True + isWordChar '$' = True + isWordChar '%' = True + isWordChar '+' = True + isWordChar '/' = True + isWordChar '@' = True + isWordChar '\\' = True + isWordChar '_' = True + isWordChar '-' = True + isWordChar '&' = True + isWordChar '=' = True + isWordChar c = isAlphaNum c + + wordChar = satisfy isWordChar + percentEscaped = try $ (:) <$> char '%' <*> many1 hexDigit + entity = try $ pure <$> characterReference + punct = try $ many1 (char ',') <|> fmap pure (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>')) + uriChunk = many1 wordChar + <|> percentEscaped + <|> entity + <|> try (punct <* lookAhead (void wordChar <|> void percentEscaped)) + uriChunkBetween l r = try $ do chunk <- between (char l) (char r) uriChunk + return (T.pack $ [l] ++ chunk ++ [r]) + +-- | Applies a parser, returns tuple of its results and its horizontal +-- displacement (the difference between the source column at the end +-- and the source column at the beginning). Vertical displacement +-- (source row) is ignored. +withHorizDisplacement :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m a -- ^ Parser to apply + -> ParserT s st m (a, Int) -- ^ (result, displacement) +withHorizDisplacement parser = do + pos1 <- getPosition + result <- parser + pos2 <- getPosition + return (result, sourceColumn pos2 - sourceColumn pos1) + +-- | Applies a parser and returns the raw string that was parsed, +-- along with the value produced by the parser. +withRaw :: Monad m + => ParsecT Sources st m a + -> ParsecT Sources st m (a, Text) +withRaw parser = do + inps1 <- getInput + result <- parser + inps2 <- getInput + -- 'raw' is the difference between inps1 and inps2 + return (result, sourcesDifference inps1 inps2) + +sourcesDifference :: Sources -> Sources -> Text +sourcesDifference (Sources is1) (Sources is2) = go is1 is2 + where + go inps1 inps2 = + case (inps1, inps2) of + ([], _) -> mempty + (_, []) -> mconcat $ map snd inps1 + ((p1,t1):rest1, (p2, t2):rest2) + | p1 == p2 + , t1 == t2 -> go rest1 rest2 + | p1 == p2 + , t1 /= t2 -> fromMaybe mempty $ T.stripSuffix t2 t1 + | otherwise -> t1 <> go rest1 inps2 + +-- | Parses backslash, then applies character parser. +escaped :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Char -- ^ Parser for character to escape + -> ParserT s st m Char +escaped parser = try $ char '\\' >> parser + +-- | Parse character entity. +characterReference :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char +characterReference = try $ do + char '&' + ent <- many1Till nonspaceChar (char ';') + let ent' = case ent of + '#':'X':xs -> '#':'x':xs -- workaround tagsoup bug + '#':_ -> ent + _ -> ent ++ ";" + case lookupEntity ent' of + Just (c : _) -> return c + _ -> Prelude.fail "entity not found" + +-- | Parses a character reference and returns a Str element. +charRef :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inline +charRef = Str . T.singleton <$> characterReference + +lineBlockLine :: Monad m => ParserT Sources st m Text +lineBlockLine = try $ do + char '|' + char ' ' + white <- T.pack <$> many (spaceChar >> return '\160') + notFollowedBy newline + line <- anyLine + continuations <- many (try $ char ' ' >> anyLine) + return $ white <> T.unwords (line : continuations) + +blankLineBlockLine :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char +blankLineBlockLine = try (char '|' >> blankline) + +-- | Parses an RST-style line block and returns a list of strings. +lineBlockLines :: Monad m => ParserT Sources st m [Text] +lineBlockLines = try $ do + lines' <- many1 (lineBlockLine <|> (T.singleton <$> blankLineBlockLine)) + skipMany blankline + return lines' + + +-- | Removes the ParsecT layer from the monad transformer stack +readWithM :: (Monad m, ToSources t) + => ParserT Sources st m a -- ^ parser + -> st -- ^ initial state + -> t -- ^ input + -> m (Either PandocError a) +readWithM parser state input = + mapLeft (PandocParsecError sources) + <$> runParserT parser state (initialSourceName sources) sources + where + sources = toSources input + +-- | Parse a string with a given parser and state +readWith :: ToSources t + => Parser Sources st a + -> st + -> t + -> Either PandocError a +readWith p t inp = runIdentity $ readWithM p t inp + +-- | Parse a string with @parser@ (for testing). +testStringWith :: Show a + => ParserT Sources ParserState Identity a + -> Text + -> IO () +testStringWith parser str = UTF8.putStrLn $ tshow $ + readWith parser defaultParserState (toSources str) + +-- | Add header to the list of headers in state, together +-- with its associated identifier. If the identifier is null +-- and the auto_identifiers extension is set, generate a new +-- unique identifier, and update the list of identifiers +-- in state. Issue a warning if an explicit identifier +-- is encountered that duplicates an earlier identifier +-- (explicit or automatically generated). +registerHeader :: (Stream s m a, HasReaderOptions st, + HasLogMessages st, HasIdentifierList st) + => Attr -> Inlines -> ParserT s st m Attr +registerHeader (ident,classes,kvs) header' = do + ids <- extractIdentifierList <$> getState + exts <- getOption readerExtensions + if T.null ident && Ext_auto_identifiers `extensionEnabled` exts + then do + let id' = uniqueIdent exts (B.toList header') ids + let id'' = if Ext_ascii_identifiers `extensionEnabled` exts + then toAsciiText id' + else id' + updateState $ updateIdentifierList $ Set.insert id' + updateState $ updateIdentifierList $ Set.insert id'' + return (id'',classes,kvs) + else do + unless (T.null ident) $ do + when (ident `Set.member` ids) $ do + pos <- getPosition + logMessage $ DuplicateIdentifier ident pos + updateState $ updateIdentifierList $ Set.insert ident + return (ident,classes,kvs) + +-- This is used to prevent exponential blowups for things like: +-- a**a*a**a*a**a*a**a*a**a*a**a*a** +nested :: Stream s m a + => ParserT s ParserState m a + -> ParserT s ParserState m a +nested p = do + nestlevel <- stateMaxNestingLevel <$> getState + guard $ nestlevel > 0 + updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } + return res + +token :: (Stream s m t) + => (t -> Text) + -> (t -> SourcePos) + -> (t -> Maybe a) + -> ParsecT s st m a +token pp pos match = tokenPrim (T.unpack . pp) (\_ t _ -> pos t) match + +infixr 5 <+?> +(<+?>) :: (Monoid a) => ParserT s st m a -> ParserT s st m a -> ParserT s st m a +a <+?> b = a >>= flip fmap (try b <|> return mempty) . mappend + +extractIdClass :: Attr -> Attr +extractIdClass (ident, cls, kvs) = (ident', cls', kvs') + where + ident' = fromMaybe ident (lookup "id" kvs) + cls' = maybe cls T.words $ lookup "class" kvs + kvs' = filter (\(k,_) -> k /= "id" || k /= "class") kvs + +insertIncludedFile :: (PandocMonad m, HasIncludeFiles st) + => ParserT a st m b -- ^ parser to apply + -> (Text -> a) -- ^ convert Text to stream type + -> [FilePath] -- ^ search path (directories) + -> FilePath -- ^ path of file to include + -> Maybe Int -- ^ start line (negative counts from end) + -> Maybe Int -- ^ end line (negative counts from end) + -> ParserT a st m b +insertIncludedFile parser toStream dirs f mbstartline mbendline = do + oldPos <- getPosition + oldInput <- getInput + containers <- getIncludeFiles <$> getState + when (T.pack f `elem` containers) $ + throwError $ PandocParseError $ T.pack $ "Include file loop at " ++ show oldPos + updateState $ addIncludeFile $ T.pack f + mbcontents <- readFileFromDirs dirs f + contents <- case mbcontents of + Just s -> return $ exciseLines mbstartline mbendline s + Nothing -> do + report $ CouldNotLoadIncludeFile (T.pack f) oldPos + return "" + setInput $ toStream contents + setPosition $ newPos f (fromMaybe 1 mbstartline) 1 + result <- parser + setInput oldInput + setPosition oldPos + updateState dropLatestIncludeFile + return result + +exciseLines :: Maybe Int -> Maybe Int -> Text -> Text +exciseLines Nothing Nothing t = t +exciseLines mbstartline mbendline t = + T.unlines $ take (endline' - (startline' - 1)) + $ drop (startline' - 1) contentLines + where + contentLines = T.lines t + numLines = length contentLines + startline' = case mbstartline of + Nothing -> 1 + Just x | x >= 0 -> x + | otherwise -> numLines + x -- negative from end + endline' = case mbendline of + Nothing -> numLines + Just x | x >= 0 -> x + | otherwise -> numLines + x -- negative from end diff --git a/src/Text/Pandoc/Parsing/GridTable.hs b/src/Text/Pandoc/Parsing/GridTable.hs new file mode 100644 index 000000000..ebc12197f --- /dev/null +++ b/src/Text/Pandoc/Parsing/GridTable.hs @@ -0,0 +1,217 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Parsing.GridTable + Copyright : Copyright (C) 2006-2022 John MacFarlane + License : GPL-2.0-or-later + Maintainer : John MacFarlane <[email protected]> + +Shared parsers for plaintext tables. +-} +module Text.Pandoc.Parsing.GridTable + ( gridTableWith + , gridTableWith' + , tableWith + , tableWith' + , widthsFromIndices + ) +where + +import Control.Monad (guard) +import Data.List (transpose) +import Data.Text (Text) +import Text.Pandoc.Options (ReaderOptions (readerColumns)) +import Text.Pandoc.Builder (Blocks) +import Text.Pandoc.Definition +import Text.Pandoc.Parsing.Capabilities +import Text.Pandoc.Parsing.Combinators +import Text.Pandoc.Parsing.Types +import Text.Pandoc.Shared (compactify, splitTextByIndices, trim, trimr) +import Text.Pandoc.Sources +import Text.Parsec + ( Stream (..), many1, notFollowedBy, option, optional, sepEndBy1, try ) + +import qualified Data.Text as T +import qualified Text.Pandoc.Builder as B + +-- | Parse a grid table: starts with row of '-' on top, then header +-- (which may be grid), then the rows, which may be grid, separated by +-- blank lines, and ending with a footer (dashed line followed by blank +-- line). +gridTableWith :: (Monad m, Monad mf, HasLastStrPosition st, HasReaderOptions st) + => ParserT Sources st m (mf Blocks) -- ^ Block list parser + -> Bool -- ^ Headerless table + -> ParserT Sources st m (mf Blocks) +gridTableWith blocks headless = + tableWith (gridTableHeader headless blocks) (gridTableRow blocks) + (gridTableSep '-') gridTableFooter + +gridTableWith' :: (Monad m, Monad mf, + HasReaderOptions st, HasLastStrPosition st) + => ParserT Sources st m (mf Blocks) -- ^ Block list parser + -> Bool -- ^ Headerless table + -> ParserT Sources st m (TableComponents mf) +gridTableWith' blocks headless = + tableWith' (gridTableHeader headless blocks) (gridTableRow blocks) + (gridTableSep '-') gridTableFooter + +gridTableSplitLine :: [Int] -> Text -> [Text] +gridTableSplitLine indices line = map removeFinalBar $ tail $ + splitTextByIndices (init indices) $ trimr line + +gridPart :: Monad m => Char -> ParserT Sources st m ((Int, Int), Alignment) +gridPart ch = do + leftColon <- option False (True <$ char ':') + dashes <- many1 (char ch) + rightColon <- option False (True <$ char ':') + char '+' + let lengthDashes = length dashes + (if leftColon then 1 else 0) + + (if rightColon then 1 else 0) + let alignment = case (leftColon, rightColon) of + (True, True) -> AlignCenter + (True, False) -> AlignLeft + (False, True) -> AlignRight + (False, False) -> AlignDefault + return ((lengthDashes, lengthDashes + 1), alignment) + +gridDashedLines :: Monad m => Char -> ParserT Sources st m [((Int, Int), Alignment)] +gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline + +removeFinalBar :: Text -> Text +removeFinalBar = T.dropWhileEnd go . T.dropWhileEnd (=='|') + where + go c = T.any (== c) " \t" + +-- | Separator between rows of grid table. +gridTableSep :: Monad m => Char -> ParserT Sources st m Char +gridTableSep ch = try $ gridDashedLines ch >> return '\n' + +-- | Parse header for a grid table. +gridTableHeader :: (Monad m, Monad mf, HasLastStrPosition st) + => Bool -- ^ Headerless table + -> ParserT Sources st m (mf Blocks) + -> ParserT Sources st m (mf [Blocks], [Alignment], [Int]) +gridTableHeader True _ = do + optional blanklines + dashes <- gridDashedLines '-' + let aligns = map snd dashes + let lines' = map (snd . fst) dashes + let indices = scanl (+) 0 lines' + return (return [], aligns, indices) +gridTableHeader False blocks = try $ do + optional blanklines + dashes <- gridDashedLines '-' + rawContent <- many1 (notFollowedBy (gridTableSep '=') >> char '|' >> + T.pack <$> many1Till anyChar newline) + underDashes <- gridDashedLines '=' + guard $ length dashes == length underDashes + let lines' = map (snd . fst) underDashes + let indices = scanl (+) 0 lines' + let aligns = map snd underDashes + let rawHeads = map (T.unlines . map trim) $ transpose + $ map (gridTableSplitLine indices) rawContent + heads <- sequence <$> mapM (parseFromString' blocks . trim) rawHeads + return (heads, aligns, indices) + +gridTableRawLine :: (Stream s m Char, UpdateSourcePos s Char) => [Int] -> ParserT s st m [Text] +gridTableRawLine indices = do + char '|' + line <- many1Till anyChar newline + return (gridTableSplitLine indices $ T.pack line) + +-- | Parse row of grid table. +gridTableRow :: (Monad m, Monad mf, HasLastStrPosition st) + => ParserT Sources st m (mf Blocks) + -> [Int] + -> ParserT Sources st m (mf [Blocks]) +gridTableRow blocks indices = do + colLines <- many1 (gridTableRawLine indices) + let cols = map ((<> "\n") . T.unlines . removeOneLeadingSpace) $ + transpose colLines + compactifyCell bs = case compactify [bs] of + [] -> mempty + x:_ -> x + cells <- sequence <$> mapM (parseFromString' blocks) cols + return $ fmap (map compactifyCell) cells + +removeOneLeadingSpace :: [Text] -> [Text] +removeOneLeadingSpace xs = + if all startsWithSpace xs + then map (T.drop 1) xs + else xs + where startsWithSpace t = case T.uncons t of + Nothing -> True + Just (c, _) -> c == ' ' + +-- | Parse footer for a grid table. +gridTableFooter :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () +gridTableFooter = optional blanklines + +--- + +-- | Parse a table using 'headerParser', 'rowParser', +-- 'lineParser', and 'footerParser'. +tableWith :: (Stream s m Char, UpdateSourcePos s Char, + HasReaderOptions st, Monad mf) + => ParserT s st m (mf [Blocks], [Alignment], [Int]) + -> ([Int] -> ParserT s st m (mf [Blocks])) + -> ParserT s st m sep + -> ParserT s st m end + -> ParserT s st m (mf Blocks) +tableWith headerParser rowParser lineParser footerParser = try $ do + (aligns, widths, heads, rows) <- tableWith' headerParser rowParser + lineParser footerParser + let th = TableHead nullAttr <$> heads + tb = (:[]) . TableBody nullAttr 0 [] <$> rows + tf = pure $ TableFoot nullAttr [] + return $ B.table B.emptyCaption (zip aligns (map fromWidth widths)) <$> th <*> tb <*> tf + where + fromWidth n + | n > 0 = ColWidth n + | otherwise = ColWidthDefault + +type TableComponents mf = ([Alignment], [Double], mf [Row], mf [Row]) + +tableWith' :: (Stream s m Char, UpdateSourcePos s Char, + HasReaderOptions st, Monad mf) + => ParserT s st m (mf [Blocks], [Alignment], [Int]) + -> ([Int] -> ParserT s st m (mf [Blocks])) + -> ParserT s st m sep + -> ParserT s st m end + -> ParserT s st m (TableComponents mf) +tableWith' headerParser rowParser lineParser footerParser = try $ do + (heads, aligns, indices) <- headerParser + lines' <- sequence <$> rowParser indices `sepEndBy1` lineParser + footerParser + numColumns <- getOption readerColumns + let widths = if null indices + then replicate (length aligns) 0.0 + else widthsFromIndices numColumns indices + let toRow = Row nullAttr . map B.simpleCell + toHeaderRow l = [toRow l | not (null l)] + return (aligns, widths, toHeaderRow <$> heads, map toRow <$> lines') + +-- Calculate relative widths of table columns, based on indices +widthsFromIndices :: Int -- Number of columns on terminal + -> [Int] -- Indices + -> [Double] -- Fractional relative sizes of columns +widthsFromIndices _ [] = [] +widthsFromIndices numColumns' indices = + let numColumns = max numColumns' (if null indices then 0 else last indices) + lengths' = zipWith (-) indices (0:indices) + lengths = reverse $ + case reverse lengths' of + [] -> [] + [x] -> [x] + -- compensate for the fact that intercolumn + -- spaces are counted in widths of all columns + -- but the last... + (x:y:zs) -> if x < y && y - x <= 2 + then y:y:zs + else x:y:zs + totLength = sum lengths + quotient = if totLength > numColumns + then fromIntegral totLength + else fromIntegral numColumns + fracs = map (\l -> fromIntegral l / quotient) lengths in + tail fracs diff --git a/src/Text/Pandoc/Parsing/Lists.hs b/src/Text/Pandoc/Parsing/Lists.hs new file mode 100644 index 000000000..d3f2a2e6f --- /dev/null +++ b/src/Text/Pandoc/Parsing/Lists.hs @@ -0,0 +1,215 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | +Module : Text.Pandoc.Parsing.Lists +Copyright : © 2006-2022 John MacFarlane +License : GPL-2.0-or-later +Maintainer : John MacFarlane <[email protected]> + +Parsers for list markers. +-} + +module Text.Pandoc.Parsing.Lists + ( anyOrderedListMarker + , decimal + , indentWith + , lowerAlpha + , lowerRoman + , orderedListMarker + , romanNumeral + , upperAlpha + , upperRoman + ) +where + +import Data.Char + ( isAsciiUpper + , isAsciiLower + , ord + , toLower + ) +import Data.Maybe (fromMaybe) +import Text.Pandoc.Definition + ( ListNumberDelim(..) + , ListAttributes + , ListNumberStyle(..) + ) +import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Sources +import Text.Parsec + ( (<|>) + , Stream(..) + , choice + , getState + , lookAhead + , many + , many1 + , option + , try + , updateState + ) +import Text.Pandoc.Parsing.Combinators +import Text.Pandoc.Parsing.State +import Text.Pandoc.Parsing.Types (ParserT) + +import qualified Data.Map as M +import qualified Data.Text as T + +-- | Parses a roman numeral (uppercase or lowercase), returns number. +romanNumeral :: (Stream s m Char, UpdateSourcePos s Char) => Bool -- ^ Uppercase if true + -> ParserT s st m Int +romanNumeral upperCase = do + let rchar uc = char $ if upperCase then uc else toLower uc + let one = rchar 'I' + let five = rchar 'V' + let ten = rchar 'X' + let fifty = rchar 'L' + let hundred = rchar 'C' + let fivehundred = rchar 'D' + let thousand = rchar 'M' + lookAhead $ choice [one, five, ten, fifty, hundred, fivehundred, thousand] + thousands <- (1000 *) . length <$> many thousand + ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900 + fivehundreds <- option 0 $ 500 <$ fivehundred + fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400 + hundreds <- (100 *) . length <$> many hundred + nineties <- option 0 $ try $ ten >> hundred >> return 90 + fifties <- option 0 (50 <$ fifty) + forties <- option 0 $ try $ ten >> fifty >> return 40 + tens <- (10 *) . length <$> many ten + nines <- option 0 $ try $ one >> ten >> return 9 + fives <- option 0 (5 <$ five) + fours <- option 0 $ try $ one >> five >> return 4 + ones <- length <$> many one + let total = thousands + ninehundreds + fivehundreds + fourhundreds + + hundreds + nineties + fifties + forties + tens + nines + + fives + fours + ones + if total == 0 + then Prelude.fail "not a roman numeral" + else return total + +-- | Parses an uppercase roman numeral and returns (UpperRoman, number). +upperRoman :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) +upperRoman = do + num <- romanNumeral True + return (UpperRoman, num) + +-- | Parses a lowercase roman numeral and returns (LowerRoman, number). +lowerRoman :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) +lowerRoman = do + num <- romanNumeral False + return (LowerRoman, num) + +-- | Parses a decimal numeral and returns (Decimal, number). +decimal :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) +decimal = do + num <- many1 digit + return (Decimal, fromMaybe 1 $ safeRead $ T.pack num) + +-- | Parses a '@' and optional label and +-- returns (DefaultStyle, [next example number]). The next +-- example number is incremented in parser state, and the label +-- (if present) is added to the label table. +exampleNum :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s ParserState m (ListNumberStyle, Int) +exampleNum = do + char '@' + lab <- mconcat . map T.pack <$> + many (many1 alphaNum <|> + try (do c <- char '_' <|> char '-' + cs <- many1 alphaNum + return (c:cs))) + st <- getState + let num = stateNextExample st + let newlabels = if T.null lab + then stateExamples st + else M.insert lab num $ stateExamples st + updateState $ \s -> s{ stateNextExample = num + 1 + , stateExamples = newlabels } + return (Example, num) + +-- | Parses a '#' returns (DefaultStyle, 1). +defaultNum :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) +defaultNum = do + char '#' + return (DefaultStyle, 1) + +-- | Parses a lowercase letter and returns (LowerAlpha, number). +lowerAlpha :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) +lowerAlpha = do + ch <- satisfy isAsciiLower + return (LowerAlpha, ord ch - ord 'a' + 1) + +-- | Parses an uppercase letter and returns (UpperAlpha, number). +upperAlpha :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) +upperAlpha = do + ch <- satisfy isAsciiUpper + return (UpperAlpha, ord ch - ord 'A' + 1) + +-- | Parses a roman numeral i or I +romanOne :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) +romanOne = (char 'i' >> return (LowerRoman, 1)) <|> + (char 'I' >> return (UpperRoman, 1)) + +-- | Parses an ordered list marker and returns list attributes. +anyOrderedListMarker :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s ParserState m ListAttributes +anyOrderedListMarker = choice + [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], + numParser <- [decimal, exampleNum, defaultNum, romanOne, + lowerAlpha, lowerRoman, upperAlpha, upperRoman]] + +-- | Parses a list number (num) followed by a period, returns list attributes. +inPeriod :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inPeriod num = try $ do + (style, start) <- num + char '.' + let delim = if style == DefaultStyle + then DefaultDelim + else Period + return (start, style, delim) + +-- | Parses a list number (num) followed by a paren, returns list attributes. +inOneParen :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inOneParen num = try $ do + (style, start) <- num + char ')' + return (start, style, OneParen) + +-- | Parses a list number (num) enclosed in parens, returns list attributes. +inTwoParens :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inTwoParens num = try $ do + char '(' + (style, start) <- num + char ')' + return (start, style, TwoParens) + +-- | Parses an ordered list marker with a given style and delimiter, +-- returns number. +orderedListMarker :: (Stream s m Char, UpdateSourcePos s Char) + => ListNumberStyle + -> ListNumberDelim + -> ParserT s ParserState m Int +orderedListMarker style delim = do + let num = defaultNum <|> -- # can continue any kind of list + case style of + DefaultStyle -> decimal + Example -> exampleNum + Decimal -> decimal + UpperRoman -> upperRoman + LowerRoman -> lowerRoman + UpperAlpha -> upperAlpha + LowerAlpha -> lowerAlpha + let context = case delim of + DefaultDelim -> inPeriod + Period -> inPeriod + OneParen -> inOneParen + TwoParens -> inTwoParens + (start, _, _) <- context num + return start + diff --git a/src/Text/Pandoc/Parsing/Math.hs b/src/Text/Pandoc/Parsing/Math.hs new file mode 100644 index 000000000..f234f9123 --- /dev/null +++ b/src/Text/Pandoc/Parsing/Math.hs @@ -0,0 +1,96 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | +Module : Text.Pandoc.Parsing.Math +Copyright : © 2006-2022 John MacFarlane +License : GPL-2.0-or-later +Maintainer : John MacFarlane <[email protected]> + +Parsing of LaTeX math. +-} + +module Text.Pandoc.Parsing.Math + ( mathDisplay + , mathInline + ) +where + +import Control.Monad (mzero, when) +import Data.Text (Text) +import Text.Parsec ((<|>), Stream(..), notFollowedBy, skipMany, try) +import Text.Pandoc.Options + ( Extension(Ext_tex_math_dollars, Ext_tex_math_single_backslash, + Ext_tex_math_double_backslash) ) +import Text.Pandoc.Parsing.Capabilities (HasReaderOptions, guardEnabled) +import Text.Pandoc.Parsing.Combinators +import Text.Pandoc.Parsing.Types (ParserT) +import Text.Pandoc.Shared (trimMath) +import Text.Pandoc.Sources + (UpdateSourcePos, anyChar, char, digit, newline, satisfy, space, string) + +import qualified Data.Text as T + +mathInlineWith :: (Stream s m Char, UpdateSourcePos s Char) => Text -> Text -> ParserT s st m Text +mathInlineWith op cl = try $ do + textStr op + when (op == "$") $ notFollowedBy space + words' <- many1Till ( + (T.singleton <$> + satisfy (\c -> not (isSpaceChar c || c == '\\'))) + <|> (char '\\' >> + -- This next clause is needed because \text{..} can + -- contain $, \(\), etc. + (try (string "text" >> + (("\\text" <>) <$> inBalancedBraces 0 "")) + <|> (\c -> T.pack ['\\',c]) <$> anyChar)) + <|> do (blankline <* notFollowedBy' blankline) <|> + (spaceChar <* skipMany spaceChar) + notFollowedBy (char '$') + return " " + ) (try $ textStr cl) + notFollowedBy digit -- to prevent capture of $5 + return $ trimMath $ T.concat words' + where + inBalancedBraces :: (Stream s m Char, UpdateSourcePos s Char) => Int -> Text -> ParserT s st m Text + inBalancedBraces n = fmap T.pack . inBalancedBraces' n . T.unpack + + inBalancedBraces' :: (Stream s m Char, UpdateSourcePos s Char) => Int -> String -> ParserT s st m String + inBalancedBraces' 0 "" = do + c <- anyChar + if c == '{' + then inBalancedBraces' 1 "{" + else mzero + inBalancedBraces' 0 s = return $ reverse s + inBalancedBraces' numOpen ('\\':xs) = do + c <- anyChar + inBalancedBraces' numOpen (c:'\\':xs) + inBalancedBraces' numOpen xs = do + c <- anyChar + case c of + '}' -> inBalancedBraces' (numOpen - 1) (c:xs) + '{' -> inBalancedBraces' (numOpen + 1) (c:xs) + _ -> inBalancedBraces' numOpen (c:xs) + +mathDisplayWith :: (Stream s m Char, UpdateSourcePos s Char) => Text -> Text -> ParserT s st m Text +mathDisplayWith op cl = try $ fmap T.pack $ do + textStr op + many1Till (satisfy (/= '\n') <|> (newline <* notFollowedBy' blankline)) + (try $ textStr cl) + +mathDisplay :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Text +mathDisplay = + (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") + <|> (guardEnabled Ext_tex_math_single_backslash >> + mathDisplayWith "\\[" "\\]") + <|> (guardEnabled Ext_tex_math_double_backslash >> + mathDisplayWith "\\\\[" "\\\\]") + +mathInline :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Text +mathInline = + (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") + <|> (guardEnabled Ext_tex_math_single_backslash >> + mathInlineWith "\\(" "\\)") + <|> (guardEnabled Ext_tex_math_double_backslash >> + mathInlineWith "\\\\(" "\\\\)") diff --git a/src/Text/Pandoc/Parsing/Smart.hs b/src/Text/Pandoc/Parsing/Smart.hs new file mode 100644 index 000000000..58254d2a6 --- /dev/null +++ b/src/Text/Pandoc/Parsing/Smart.hs @@ -0,0 +1,145 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | +Module : Text.Pandoc.Parsing.Combinators +Copyright : © 2006-2022 John MacFarlane +License : GPL-2.0-or-later +Maintainer : John MacFarlane <[email protected]> + +Smart parsing of quotes, dashes, and other character combinations. +-} + +module Text.Pandoc.Parsing.Smart + ( apostrophe + , dash + , doubleCloseQuote + , doubleQuoteEnd + , doubleQuoteStart + , doubleQuoted + , ellipses + , singleQuoteEnd + , singleQuoteStart + , singleQuoted + , smartPunctuation + ) +where + +import Control.Monad (guard , void) +import Text.Pandoc.Builder (Inlines) +import Text.Pandoc.Options + ( extensionEnabled + , Extension(Ext_old_dashes, Ext_smart) + , ReaderOptions(readerExtensions) ) +import Text.Pandoc.Sources +import Text.Pandoc.Parsing.Capabilities +import Text.Pandoc.Parsing.Combinators +import Text.Pandoc.Parsing.Types (ParserT) +import Text.Parsec + ( (<|>) + , Stream(..) + , choice + , lookAhead + , manyTill + , notFollowedBy + , try + ) + +import qualified Text.Pandoc.Builder as B + +smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, + HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +smartPunctuation inlineParser = do + guardEnabled Ext_smart + choice [ quoted inlineParser, apostrophe, doubleCloseQuote, dash, ellipses ] + +quoted :: (HasLastStrPosition st, HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser + +singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +singleQuoted inlineParser = do + singleQuoteStart + (B.singleQuoted . mconcat <$> + try + (withQuoteContext InSingleQuote (many1Till inlineParser singleQuoteEnd))) + <|> pure "\8217" + +doubleQuoted :: (HasQuoteContext st m, HasLastStrPosition st, + Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +doubleQuoted inlineParser = do + doubleQuoteStart + (B.doubleQuoted . mconcat <$> + try + (withQuoteContext InDoubleQuote (manyTill inlineParser doubleQuoteEnd))) + <|> pure (B.str "\8220") + +charOrRef :: (Stream s m Char, UpdateSourcePos s Char) => [Char] -> ParserT s st m Char +charOrRef cs = + oneOf cs <|> try (do c <- characterReference + guard (c `elem` cs) + return c) + +singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m () +singleQuoteStart = do + failIfInQuoteContext InSingleQuote + -- single quote start can't be right after str + guard =<< notAfterString + try $ do + charOrRef "'\8216\145" + void $ lookAhead (satisfy (not . isSpaceChar)) + +singleQuoteEnd :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m () +singleQuoteEnd = try $ do + charOrRef "'\8217\146" + notFollowedBy alphaNum + +doubleQuoteStart :: (HasLastStrPosition st, + HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m () +doubleQuoteStart = do + failIfInQuoteContext InDoubleQuote + guard =<< notAfterString + try $ do charOrRef "\"\8220\147" + void $ lookAhead (satisfy (not . isSpaceChar)) + +doubleQuoteEnd :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m () +doubleQuoteEnd = void (charOrRef "\"\8221\148") + +apostrophe :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines +apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\8217") + +doubleCloseQuote :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines +doubleCloseQuote = B.str "\8221" <$ char '"' + +ellipses :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Inlines +ellipses = try (string "..." >> return (B.str "\8230")) + +dash :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Inlines +dash = try $ do + oldDashes <- extensionEnabled Ext_old_dashes <$> getOption readerExtensions + if oldDashes + then do + char '-' + (char '-' >> return (B.str "\8212")) + <|> (lookAhead digit >> return (B.str "\8211")) + else do + string "--" + (char '-' >> return (B.str "\8212")) + <|> return (B.str "\8211") diff --git a/src/Text/Pandoc/Parsing/State.hs b/src/Text/Pandoc/Parsing/State.hs new file mode 100644 index 000000000..93eca0c50 --- /dev/null +++ b/src/Text/Pandoc/Parsing/State.hs @@ -0,0 +1,191 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Parsing + Copyright : Copyright (C) 2006-2022 John MacFarlane + License : GPL-2.0-or-later + Maintainer : John MacFarlane <[email protected]> + +A default parser state with commonly used properties. +-} + +module Text.Pandoc.Parsing.State + ( ParserState (..) + , ParserContext (..) + , HeaderType (..) + , NoteTable + , NoteTable' + , Key (..) + , KeyTable + , SubstTable + , defaultParserState + , toKey + ) +where + +import Data.Default (Default (def)) +import Data.Text (Text) +import Text.Parsec (SourcePos, getState, setState) +import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines) +import Text.Pandoc.Definition (Attr, Meta, Target, nullMeta) +import Text.Pandoc.Logging (LogMessage) +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Parsing.Capabilities +import Text.Pandoc.Parsing.Types +import Text.Pandoc.Readers.LaTeX.Types (Macro) + +import qualified Data.Map as M +import qualified Data.Set as Set +import qualified Data.Text as T + +-- | Parsing options. +data ParserState = ParserState + { stateOptions :: ReaderOptions -- ^ User options + , stateParserContext :: ParserContext -- ^ Inside list? + , stateQuoteContext :: QuoteContext -- ^ Inside quoted environment? + , stateAllowLinks :: Bool -- ^ Allow parsing of links + , stateAllowLineBreaks :: Bool -- ^ Allow parsing of line breaks + , stateMaxNestingLevel :: Int -- ^ Max # of nested Strong/Emph + , stateLastStrPos :: Maybe SourcePos -- ^ Position after last str parsed + , stateKeys :: KeyTable -- ^ List of reference keys + , stateHeaderKeys :: KeyTable -- ^ List of implicit header ref keys + , stateSubstitutions :: SubstTable -- ^ List of substitution references + , stateNotes :: NoteTable -- ^ List of notes (raw bodies) + , stateNotes' :: NoteTable' -- ^ List of notes (parsed bodies) + , stateNoteRefs :: Set.Set Text -- ^ List of note references used + , stateInNote :: Bool -- ^ True if parsing note contents + , stateNoteNumber :: Int -- ^ Last note number for citations + , stateMeta :: Meta -- ^ Document metadata + , stateMeta' :: Future ParserState Meta -- ^ Document metadata + , stateCitations :: M.Map Text Text -- ^ RST-style citations + , stateHeaderTable :: [HeaderType] -- ^ Ordered list of header types used + , stateIdentifiers :: Set.Set Text -- ^ Header identifiers used + , stateNextExample :: Int -- ^ Number of next example + , stateExamples :: M.Map Text Int -- ^ Map from example labels to numbers + , stateMacros :: M.Map Text Macro -- ^ Table of macros defined so far + , stateRstDefaultRole :: Text -- ^ Current rST default + -- interpreted text role + , stateRstHighlight :: Maybe Text -- ^ Current rST literal block + -- language + , stateRstCustomRoles :: M.Map Text (Text, Maybe Text, Attr) + -- ^ Current rST cust text roles; + -- Triple represents:) Base role 2) Optional format (only for :raw: + -- roles) 3) Addition classes (rest of Attr is unused)). + , stateCaption :: Maybe Inlines -- ^ Caption in current environment + , stateInHtmlBlock :: Maybe Text -- ^ Tag type of HTML block being parsed + , stateFencedDivLevel :: Int -- ^ Depth of fenced div + , stateContainers :: [Text] -- ^ parent include files + , stateLogMessages :: [LogMessage] -- ^ log messages + , stateMarkdownAttribute :: Bool -- ^ True if in markdown=1 context + } + +instance Default ParserState where + def = defaultParserState + +instance HasMeta ParserState where + setMeta field val st = + st{ stateMeta = setMeta field val $ stateMeta st } + deleteMeta field st = + st{ stateMeta = deleteMeta field $ stateMeta st } + +instance HasReaderOptions ParserState where + extractReaderOptions = stateOptions + +instance Monad m => HasQuoteContext ParserState m where + getQuoteContext = stateQuoteContext <$> getState + withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result + +instance HasIdentifierList ParserState where + extractIdentifierList = stateIdentifiers + updateIdentifierList f st = st{ stateIdentifiers = f $ stateIdentifiers st } + +instance HasMacros ParserState where + extractMacros = stateMacros + updateMacros f st = st{ stateMacros = f $ stateMacros st } + +instance HasLastStrPosition ParserState where + setLastStrPos pos st = st{ stateLastStrPos = pos } + getLastStrPos st = stateLastStrPos st + +instance HasLogMessages ParserState where + addLogMessage msg st = st{ stateLogMessages = msg : stateLogMessages st } + getLogMessages st = reverse $ stateLogMessages st + +instance HasIncludeFiles ParserState where + getIncludeFiles = stateContainers + addIncludeFile f s = s{ stateContainers = f : stateContainers s } + dropLatestIncludeFile s = s { stateContainers = drop 1 $ stateContainers s } + +data ParserContext + = ListItemState -- ^ Used when running parser on list item contents + | NullState -- ^ Default state + deriving (Eq, Show) + +data HeaderType + = SingleHeader Char -- ^ Single line of characters underneath + | DoubleHeader Char -- ^ Lines of characters above and below + deriving (Eq, Show) + +defaultParserState :: ParserState +defaultParserState = ParserState + { stateOptions = def + , stateParserContext = NullState + , stateQuoteContext = NoQuote + , stateAllowLinks = True + , stateAllowLineBreaks = True + , stateMaxNestingLevel = 6 + , stateLastStrPos = Nothing + , stateKeys = M.empty + , stateHeaderKeys = M.empty + , stateSubstitutions = M.empty + , stateNotes = [] + , stateNotes' = M.empty + , stateNoteRefs = Set.empty + , stateInNote = False + , stateNoteNumber = 0 + , stateMeta = nullMeta + , stateMeta' = return nullMeta + , stateCitations = M.empty + , stateHeaderTable = [] + , stateIdentifiers = Set.empty + , stateNextExample = 1 + , stateExamples = M.empty + , stateMacros = M.empty + , stateRstDefaultRole = "title-reference" + , stateRstHighlight = Nothing + , stateRstCustomRoles = M.empty + , stateCaption = Nothing + , stateInHtmlBlock = Nothing + , stateFencedDivLevel = 0 + , stateContainers = [] + , stateLogMessages = [] + , stateMarkdownAttribute = False + } + +type NoteTable = [(Text, Text)] + +type NoteTable' = M.Map Text (SourcePos, Future ParserState Blocks) +-- used in markdown reader + +newtype Key = Key Text deriving (Show, Read, Eq, Ord) + +toKey :: Text -> Key +toKey = Key . T.toLower . T.unwords . T.words . unbracket + where unbracket t + | Just ('[', t') <- T.uncons t + , Just (t'', ']') <- T.unsnoc t' + = t'' + | otherwise + = t + +type KeyTable = M.Map Key (Target, Attr) + +type SubstTable = M.Map Key Inlines diff --git a/src/Text/Pandoc/Parsing/Types.hs b/src/Text/Pandoc/Parsing/Types.hs new file mode 100644 index 000000000..6e2e3e912 --- /dev/null +++ b/src/Text/Pandoc/Parsing/Types.hs @@ -0,0 +1,57 @@ +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{- | + Module : Text.Pandoc.Parsing + Copyright : Copyright (C) 2006-2022 John MacFarlane + License : GPL-2.0-or-later + Maintainer : John MacFarlane <[email protected]> + +Types and type-related functions for parsers. +-} + +module Text.Pandoc.Parsing.Types + ( Parser + , ParserT + , Future (..) + , runF + , askF + , asksF + , returnF + ) +where + +import Control.Applicative (liftA2) +import Control.Monad.Reader + ( asks, runReader, MonadReader(ask), Reader, ReaderT(ReaderT) ) +import Text.Parsec ( Parsec , ParsecT ) + +-- | Generic parser type used by many pandoc readers. +type Parser t s = Parsec t s + +-- | Generic parser transformer used by many pandoc readers. +type ParserT = ParsecT + +-- | Reader monad wrapping the parser state. This is used to possibly +-- delay evaluation until all relevant information has been parsed and +-- made available in the parser state. +newtype Future s a = Future { runDelayed :: Reader s a } + deriving (Monad, Applicative, Functor) + +instance Semigroup a => Semigroup (Future s a) where + (<>) = liftA2 (<>) + +instance (Semigroup a, Monoid a) => Monoid (Future s a) where + mempty = return mempty + mappend = (<>) + +-- | Run a delayed action with the given state. +runF :: Future s a -> s -> a +runF = runReader . runDelayed + +askF :: Future s s +askF = Future ask + +asksF :: (s -> a) -> Future s a +asksF f = Future $ asks f + +returnF :: Monad m => a -> m (Future s a) +returnF = return . return |
