aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <[email protected]>2024-10-12 10:17:06 -0600
committerJohn MacFarlane <[email protected]>2024-10-13 15:51:05 -0600
commitc8fda8f4d38a05fd494e7ee74d9c75930bdbcf12 (patch)
tree823a25f9c39eac66397dbeef950a6f6d7fde01b6
parent32c1a31f61b012aaae784138fb2a049ccdd496ad (diff)
RST reader: Use a new one-pass parsing strategy.
Instead of having an initial pass where we collect reference definitions, we create links with target `##SUBST##something` or `##REF##something` or `##NOTE##something`, and resolve these in a pass over the parsed AST. This allows us to handle link references that are not at the top level. Closes #10281.
-rw-r--r--src/Text/Pandoc/Readers/RST.hs211
-rw-r--r--test/command/10281.md68
-rw-r--r--test/command/512.md2
3 files changed, 170 insertions, 111 deletions
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
index 52b0b2544..879604e3f 100644
--- a/src/Text/Pandoc/Readers/RST.hs
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -40,6 +40,7 @@ import Text.Pandoc.Options
import Text.Pandoc.Parsing
import Text.Pandoc.Shared
import Text.Pandoc.URI
+import Text.Pandoc.Walk (walkM)
import qualified Text.Pandoc.UTF8 as UTF8
import Data.Time.Format
import System.FilePath (takeDirectory)
@@ -150,43 +151,79 @@ metaFromDefList ds meta = adjustAuthors $ foldr f meta ds
parseRST :: PandocMonad m => RSTParser m Pandoc
parseRST = do
+ standalone <- getOption readerStandalone
optional blanklines -- skip blank lines at beginning of file
- startPos <- getPosition
- -- go through once just to get list of reference keys and notes
- -- docMinusKeys is the raw document with blanks where the keys were...
- let chunk = referenceKey
- <|> anchorDef
- <|> noteBlock
- <|> citationBlock
- <|> (snd <$> withRaw comment)
- <|> headerBlock
- <|> lineClump
- docMinusKeys <- Sources <$>
- manyTill (do pos <- getPosition
- t <- chunk
- return (pos, t)) eof
- -- UGLY: we collapse source position information.
- -- TODO: fix the parser to use the F monad instead of two passes
- setInput docMinusKeys
- setPosition startPos
- st' <- getState
- let reversedNotes = stateNotes st'
- updateState $ \s -> s { stateNotes = reverse reversedNotes
- , stateIdentifiers = mempty }
- -- now parse it for real...
blocks <- B.toList <$> parseBlocks
citations <- sort . M.toList . stateCitations <$> getState
citationItems <- mapM parseCitation citations
let refBlock = [Div ("citations",[],[]) $
B.toList $ B.definitionList citationItems | not (null citationItems)]
- standalone <- getOption readerStandalone
state <- getState
let meta = stateMeta state
let (blocks', meta') = if standalone
then titleTransform (blocks, meta)
else (blocks, meta)
+ let reversedNotes = stateNotes state
+ updateState $ \s -> s { stateNotes = reverse reversedNotes }
+ doc <- walkM resolveReferences (Pandoc meta' (blocks' ++ refBlock))
reportLogMessages
- return $ Pandoc meta' (blocks' ++ refBlock)
+ return doc
+
+resolveReferences :: PandocMonad m => Inline -> RSTParser m Inline
+resolveReferences x@(Link _ ils (s,_))
+ | Just ref <- T.stripPrefix "##REF##" s = do
+ let isAnonKey (Key (T.uncons -> Just ('_',_))) = True
+ isAnonKey _ = False
+ state <- getState
+ let keyTable = stateKeys state
+ let anonKeys = sort $ filter isAnonKey $ M.keys keyTable
+ key <- if ref == "_" -- anonymous key
+ then
+ case anonKeys of
+ [] -> mzero -- TODO log?
+ (k:_) -> return k
+ else return $ toKey ref
+ ((src,tit), attr) <- lookupKey [] key
+ -- if anonymous link, remove key so it won't be used again
+ when (isAnonKey key) $ updateState $ \st ->
+ st{ stateKeys = M.delete key keyTable }
+ return $ Link attr ils (src, tit)
+ | Just ref <- T.stripPrefix "##NOTE##" s = do
+ state <- getState
+ let notes = stateNotes state
+ case lookup ref notes of
+ Nothing -> do
+ pos <- getPosition
+ logMessage $ ReferenceNotFound ref pos
+ return x
+ Just raw -> do
+ -- We temporarily empty the note list while parsing the note,
+ -- so that we don't get infinite loops with notes inside notes...
+ -- Note references inside other notes are allowed in reST, but
+ -- not yet in this implementation.
+ updateState $ \st -> st{ stateNotes = [] }
+ contents <- parseFromString' parseBlocks raw
+ let newnotes = if ref == "*" || ref == "#" -- auto-numbered
+ -- delete the note so the next auto-numbered note
+ -- doesn't get the same contents:
+ then deleteFirstsBy (==) notes [(ref,raw)]
+ else notes
+ updateState $ \st -> st{ stateNotes = newnotes }
+ return $ Note (B.toList contents)
+ | Just ref <- T.stripPrefix "##SUBST##" s = do
+ substTable <- stateSubstitutions <$> getState
+ let key = toKey $ stripFirstAndLast ref
+ case M.lookup key substTable of
+ Nothing -> do
+ pos <- getPosition
+ logMessage $ ReferenceNotFound (tshow key) pos
+ return x
+ Just target -> case
+ B.toList target of
+ [t] -> return t
+ ts -> return $ Span nullAttr ts
+ | otherwise = return x
+resolveReferences x = return x
parseCitation :: PandocMonad m
=> (Text, Text) -> RSTParser m (Inlines, [Blocks])
@@ -207,6 +244,9 @@ block :: PandocMonad m => RSTParser m Blocks
block = choice [ codeBlock
, blockQuote
, fieldList
+ , referenceKey
+ , noteBlock
+ , citationBlock
, directive
, anchor
, comment
@@ -309,7 +349,10 @@ doubleHeader = do
Just ind -> (headerTable, ind + 1)
Nothing -> (headerTable ++ [DoubleHeader c], length headerTable + 1)
setState (state { stateHeaderTable = headerTable' })
- attr <- registerHeader nullAttr txt
+ attr@(ident,_,_) <- registerHeader nullAttr txt
+ let key = toKey (stringify txt)
+ updateState $ \s ->
+ s { stateKeys = M.insert key (("#" <> ident,""), nullAttr) $ stateKeys s }
return $ B.headerWith attr level txt
doubleHeader' :: PandocMonad m => RSTParser m (Inlines, Char)
@@ -338,7 +381,10 @@ singleHeader = do
Just ind -> (headerTable, ind + 1)
Nothing -> (headerTable ++ [SingleHeader c], length headerTable + 1)
setState (state { stateHeaderTable = headerTable' })
- attr <- registerHeader nullAttr txt
+ attr@(ident,_,_) <- registerHeader nullAttr txt
+ let key = toKey (stringify txt)
+ updateState $ \s ->
+ s { stateKeys = M.insert key (("#" <> ident,""), nullAttr) $ stateKeys s }
return $ B.headerWith attr level txt
singleHeader' :: PandocMonad m => RSTParser m (Inlines, Char)
@@ -1059,27 +1105,24 @@ mkAttr ident classes fields = (ident, classes, fields')
--- note block
---
-noteBlock :: Monad m => RSTParser m Text
+noteBlock :: Monad m => RSTParser m Blocks
noteBlock = try $ do
- (ref, raw, replacement) <- noteBlock' noteMarker
+ (ref, raw) <- noteBlock' noteMarker
updateState $ \s -> s { stateNotes = (ref, raw) : stateNotes s }
- -- return blanks so line count isn't affected
- return replacement
+ return mempty
-citationBlock :: Monad m => RSTParser m Text
+citationBlock :: Monad m => RSTParser m Blocks
citationBlock = try $ do
- (ref, raw, replacement) <- noteBlock' citationMarker
+ (ref, raw) <- noteBlock' citationMarker
updateState $ \s ->
s { stateCitations = M.insert ref raw (stateCitations s),
stateKeys = M.insert (toKey ref) (("#" <> ref,""), ("",["citation"],[]))
(stateKeys s) }
- -- return blanks so line count isn't affected
- return replacement
+ return mempty
noteBlock' :: Monad m
- => RSTParser m Text -> RSTParser m (Text, Text, Text)
+ => RSTParser m Text -> RSTParser m (Text, Text)
noteBlock' marker = try $ do
- startPos <- getPosition
string ".."
spaceChar >> skipMany spaceChar
ref <- marker
@@ -1087,10 +1130,8 @@ noteBlock' marker = try $ do
<|> (newline >> return "")
blanks <- option "" blanklines
rest <- option "" indentedBlock
- endPos <- getPosition
let raw = first <> "\n" <> blanks <> rest <> "\n"
- let replacement = T.replicate (sourceLine endPos - sourceLine startPos) "\n"
- return (ref, raw, replacement)
+ return (ref, raw)
citationMarker :: Monad m => RSTParser m Text
citationMarker = do
@@ -1132,14 +1173,11 @@ simpleReferenceName = do
referenceName :: PandocMonad m => RSTParser m Text
referenceName = quotedReferenceName <|> simpleReferenceName
-referenceKey :: PandocMonad m => RSTParser m Text
+referenceKey :: PandocMonad m => RSTParser m Blocks
referenceKey = do
- startPos <- getPosition
choice [substKey, anonymousKey, regularKey]
optional blanklines
- endPos <- getPosition
- -- return enough blanks to replace key
- return $ T.replicate (sourceLine endPos - sourceLine startPos) "\n"
+ return mempty
targetURI :: Monad m => ParsecT Sources st m Text
targetURI = do
@@ -1217,19 +1255,14 @@ regularKey = try $ do
updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $
stateKeys s }
-anchorDef :: PandocMonad m => RSTParser m Text
-anchorDef = try $ do
- (refs, raw) <- withRaw $ try (referenceNames <* blanklines)
- forM_ refs $ \rawkey ->
- updateState $ \s -> s { stateKeys =
- M.insert (toKey rawkey) (("#" <> rawkey,""), nullAttr) $ stateKeys s }
- -- keep this for 2nd round of parsing, where we'll add the divs (anchor)
- return raw
-
anchor :: PandocMonad m => RSTParser m Blocks
anchor = try $ do
refs <- referenceNames
blanklines
+ forM_ refs $ \rawkey ->
+ updateState $ \s -> s { stateKeys =
+ M.insert (toKey rawkey) (("#" <> rawkey,""), nullAttr)
+ (stateKeys s) }
b <- block
let addDiv ref = B.divWith (ref, [], [])
let emptySpanWithId id' = Span (id',[],[]) []
@@ -1245,16 +1278,6 @@ anchor = try $ do
-- because it hides them from promoteHeader, see #4240
_ -> return $ foldr addDiv b refs
-headerBlock :: PandocMonad m => RSTParser m Text
-headerBlock = do
- ((txt, _), raw) <- withRaw (doubleHeader' <|> singleHeader')
- (ident,_,_) <- registerHeader nullAttr txt
- let key = toKey (stringify txt)
- updateState $ \s -> s { stateKeys = M.insert key (("#" <> ident,""), nullAttr)
- $ stateKeys s }
- return raw
-
-
--
-- tables
--
@@ -1628,25 +1651,17 @@ citationName = do
raw <- citationMarker
return $ "[" <> raw <> "]"
+-- We store the reference link label as the link target,
+-- preceded by '##REF##'. This is replaced after the AST
+-- has been built by the resolved reference.
referenceLink :: PandocMonad m => RSTParser m Inlines
referenceLink = try $ do
ref <- (referenceName <|> citationName) <* char '_'
- let label' = B.text ref
- let isAnonKey (Key (T.uncons -> Just ('_',_))) = True
- isAnonKey _ = False
- state <- getState
- let keyTable = stateKeys state
- key <- option (toKey ref) $
- do char '_'
- let anonKeys = sort $ filter isAnonKey $ M.keys keyTable
- case anonKeys of
- [] -> mzero
- (k:_) -> return k
- ((src,tit), attr) <- lookupKey [] key
- -- if anonymous link, remove key so it won't be used again
- when (isAnonKey key) $ updateState $ \s ->
- s{ stateKeys = M.delete key keyTable }
- return $ B.linkWith attr src tit label'
+ isAnonymous <- (True <$ char '_') <|> pure False
+ let ref' = if isAnonymous
+ then "_"
+ else ref
+ pure $ B.linkWith nullAttr ("##REF##" <> ref') "" (B.text ref)
-- We keep a list of oldkeys so we can detect lookup loops.
lookupKey :: PandocMonad m
@@ -1666,6 +1681,10 @@ lookupKey oldkeys key = do
let newkey = toKey rawkey
if newkey `elem` oldkeys
then do
+ -- TODO the pos is not going to be accurate
+ -- because we're calling this after the AST is
+ -- constructed. Probably good to remove that
+ -- parameter form CircularReference at some point.
logMessage $ CircularReference rawkey pos
return (("",""),nullAttr)
else lookupKey (key:oldkeys) newkey
@@ -1687,42 +1706,14 @@ autoLink = autoURI <|> autoEmail
subst :: PandocMonad m => RSTParser m Inlines
subst = try $ do
(_,ref) <- withRaw $ enclosed (char '|') (char '|') inline
- state <- getState
- let substTable = stateSubstitutions state
- let key = toKey $ stripFirstAndLast ref
- case M.lookup key substTable of
- Nothing -> do
- pos <- getPosition
- logMessage $ ReferenceNotFound (tshow key) pos
- return mempty
- Just target -> return target
+ pure $ B.linkWith nullAttr ("##SUBST##" <> ref) "" (B.text ref)
note :: PandocMonad m => RSTParser m Inlines
note = try $ do
optional whitespace
ref <- noteMarker
char '_'
- state <- getState
- let notes = stateNotes state
- case lookup ref notes of
- Nothing -> do
- pos <- getPosition
- logMessage $ ReferenceNotFound ref pos
- return mempty
- Just raw -> do
- -- We temporarily empty the note list while parsing the note,
- -- so that we don't get infinite loops with notes inside notes...
- -- Note references inside other notes are allowed in reST, but
- -- not yet in this implementation.
- updateState $ \st -> st{ stateNotes = [] }
- contents <- parseFromString' parseBlocks raw
- let newnotes = if ref == "*" || ref == "#" -- auto-numbered
- -- delete the note so the next auto-numbered note
- -- doesn't get the same contents:
- then deleteFirstsBy (==) notes [(ref,raw)]
- else notes
- updateState $ \st -> st{ stateNotes = newnotes }
- return $ B.note contents
+ pure $ B.linkWith nullAttr ("##NOTE##" <> ref) "" (B.text ref)
smart :: PandocMonad m => RSTParser m Inlines
smart = smartPunctuation inline
diff --git a/test/command/10281.md b/test/command/10281.md
new file mode 100644
index 000000000..a946061e7
--- /dev/null
+++ b/test/command/10281.md
@@ -0,0 +1,68 @@
+```
+% pandoc -f rst -t native
+`Want Speed? Pass by Value`_
+
+.. note::
+ For more information about the pass-by-value idiom, read: `Want Speed? Pass by Value`_.
+
+ .. _Want Speed? Pass by Value: https://web.archive.org/web/20140205194657/http://cpp-next.com/archive/2009/08/want-speed-pass-by-value/
+^D
+[ Para
+ [ Link
+ ( "" , [] , [] )
+ [ Str "Want"
+ , Space
+ , Str "Speed?"
+ , Space
+ , Str "Pass"
+ , Space
+ , Str "by"
+ , Space
+ , Str "Value"
+ ]
+ ( "https://web.archive.org/web/20140205194657/http://cpp-next.com/archive/2009/08/want-speed-pass-by-value/"
+ , ""
+ )
+ ]
+, Div
+ ( "" , [ "note" ] , [] )
+ [ Div ( "" , [ "title" ] , [] ) [ Para [ Str "Note" ] ]
+ , Para
+ [ Str "For"
+ , Space
+ , Str "more"
+ , Space
+ , Str "information"
+ , Space
+ , Str "about"
+ , Space
+ , Str "the"
+ , Space
+ , Str "pass-by-value"
+ , Space
+ , Str "idiom,"
+ , Space
+ , Str "read:"
+ , Space
+ , Link
+ ( "" , [] , [] )
+ [ Str "Want"
+ , Space
+ , Str "Speed?"
+ , Space
+ , Str "Pass"
+ , Space
+ , Str "by"
+ , Space
+ , Str "Value"
+ ]
+ ( "https://web.archive.org/web/20140205194657/http://cpp-next.com/archive/2009/08/want-speed-pass-by-value/"
+ , ""
+ )
+ , Str "."
+ ]
+ ]
+]
+
+```
+
diff --git a/test/command/512.md b/test/command/512.md
index 20053d9cd..48f1f101a 100644
--- a/test/command/512.md
+++ b/test/command/512.md
@@ -37,7 +37,7 @@ Loop detection:
__ link1_
^D
-2> [WARNING] Circular reference 'link1' at line 1 column 15
+2> [WARNING] Circular reference 'link1' at line 8 column 1
<p><a href="">click here</a></p>
```