aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Melnikov <[email protected]>2025-12-21 06:27:28 +0100
committerGitHub <[email protected]>2025-12-20 21:27:28 -0800
commitaf27e8a4215985065da82a9059f11dce4e9ab3f7 (patch)
treefd4f29e59073e49d86b48914c33a6690cd0d1cb7
parent144eb4e51bb6d077c38e94db62638edbc57df8c3 (diff)
MediaWiki reader: add behavior switches support (#11354)
MediaWiki format supports [magic words](https://www.mediawiki.org/wiki/Help:Magic_words). These are basically built-in templates. This commit introduces support for behavior switches, which is one of the three types of magic words. They add a field to metadata without producing any text. Signed-off-by: botantony <[email protected]>
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs56
-rw-r--r--test/command/mediawiki_behavior_switches.md16
2 files changed, 63 insertions, 9 deletions
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index 1112007c6..be7bdc4a2 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -13,7 +13,7 @@ Conversion of mediawiki text to 'Pandoc' document.
{-
TODO:
_ correctly handle tables within tables
-_ parse templates?
+_ parse templates(?) and built-in magic words
-}
module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
@@ -28,17 +28,17 @@ import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as T
import Text.HTML.TagSoup
-import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
+import Text.Pandoc.Char (isCJK)
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Definition
import Text.Pandoc.Logging
import Text.Pandoc.Options
import Text.Pandoc.Parsing hiding (tableCaption)
import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isCommentTag, toAttr)
-import Text.Pandoc.Shared (safeRead, stringify, stripTrailingNewlines,
- trim, splitTextBy, tshow, formatCode)
-import Text.Pandoc.Char (isCJK)
+import Text.Pandoc.Shared (formatCode, safeRead, splitTextBy, stringify,
+ stripTrailingNewlines, trim, tshow)
import Text.Pandoc.XML (fromEntities)
-- | Read mediawiki from an input string and return a Pandoc document.
@@ -56,6 +56,7 @@ readMediaWiki opts s = do
, mwLogMessages = []
, mwInTT = False
, mwAllowNewlines = True
+ , mwMeta = nullMeta
}
sources
case parsed of
@@ -70,6 +71,7 @@ data MWState = MWState { mwOptions :: ReaderOptions
, mwLogMessages :: [LogMessage]
, mwInTT :: Bool
, mwAllowNewlines :: Bool
+ , mwMeta :: Meta
}
type MWParser m = ParsecT Sources MWState m
@@ -90,7 +92,7 @@ instance HasLogMessages MWState where
--
specialChars :: [Char]
-specialChars = "'[]<=&*{}|\":\\"
+specialChars = "'[]<=&*{}|\":\\_"
spaceChars :: [Char]
spaceChars = " \n\t"
@@ -110,9 +112,9 @@ isBlockTag' tag@(TagClose t) = (isBlockTag tag || t `elem` newBlockTags) &&
isBlockTag' tag = isBlockTag tag
isInlineTag' :: Tag Text -> Bool
-isInlineTag' (TagComment _) = True
+isInlineTag' (TagComment _) = True
isInlineTag' (TagClose "ref") = False -- see below inlineTag
-isInlineTag' t = not (isBlockTag' t)
+isInlineTag' t = not (isBlockTag' t)
eitherBlockOrInline :: [Text]
eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
@@ -160,11 +162,12 @@ parseMediaWiki = do
spaces
eof
categoryLinks <- reverse . mwCategoryLinks <$> getState
+ meta <- mwMeta <$> getState
let categories = if null categoryLinks
then mempty
else B.para $ mconcat $ intersperse B.space categoryLinks
reportLogMessages
- return $ B.doc $ bs <> categories
+ return $ Pandoc meta (B.toList bs <> B.toList categories)
--
-- block parsers
@@ -533,6 +536,7 @@ inline = whitespace
<|> doubleQuotes
<|> strong
<|> emph
+ <|> behaviorSwitch
<|> image
<|> internalLink
<|> externalLink
@@ -716,3 +720,37 @@ doubleQuotes = do
B.doubleQuoted <$> inlinesBetween openDoubleQuote closeDoubleQuote
where openDoubleQuote = sym "\"" >> lookAhead nonspaceChar
closeDoubleQuote = try $ sym "\""
+
+behaviorSwitch :: PandocMonad m => MWParser m Inlines
+behaviorSwitch = try $ do
+ let reservedMagicWords = [ "NOTOC"
+ , "FORCETOC"
+ , "TOC"
+ , "NOEDITSECTION"
+ , "NEWSECTIONLINK"
+ , "NONEWSECTIONLINK"
+ , "NOGALLERY"
+ , "HIDDENCAT"
+ , "EXPECTUNUSEDCATEGORY"
+ , "NOCONTENTCONVERT"
+ , "NOCC"
+ , "NOTITLECONVERT"
+ , "NOTC"
+ , "INDEX"
+ , "NOINDEX"
+ , "STATICREDIRECT"
+ , "EXPECTUNUSEDTEMPLATE"
+ -- From popular extensions
+ , "NOGLOBAL"
+ , "DISAMBIG"
+ , "ARCHIVEDTALK"
+ , "NOTALK"
+ ]
+ string "__"
+ name <- many1 alphaNum
+ string "__"
+ case name `elem` reservedMagicWords of
+ True -> do
+ updateState $ \st -> st{ mwMeta = B.setMeta (T.toLower $ T.pack name) True (mwMeta st) }
+ return mempty
+ False -> return $ B.str $ "__" <> T.pack name <> "__"
diff --git a/test/command/mediawiki_behavior_switches.md b/test/command/mediawiki_behavior_switches.md
new file mode 100644
index 000000000..3798df40b
--- /dev/null
+++ b/test/command/mediawiki_behavior_switches.md
@@ -0,0 +1,16 @@
+```
+% pandoc -f mediawiki -t native
+____INDEX____
+__NOINDEX__
+__noindex__
+__FOOBAR__
+^D
+[ Para
+ [ Str "____"
+ , SoftBreak
+ , Str "__noindex__"
+ , SoftBreak
+ , Str "__FOOBAR__"
+ ]
+]
+```