aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn MacFarlane <[email protected]>2025-11-23 08:45:18 +0100
committerJohn MacFarlane <[email protected]>2025-11-29 17:04:42 +0100
commit8651ec3c28693c875d78017b4f278338417d176a (patch)
tree24065899ab03f7daaa628a04a8060f4b54c2e505 /src
parent525113c31ea07d35a5a1e436866c7ed51fc90ccf (diff)
Add asciidoc as an input format.
New exported module Text.Pandoc.Readers.AsciiDoc, exporting readAsciiDoc [API change]. The bulk of parsing is handled by the asciidoc library. Closes #1456.
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers.hs3
-rw-r--r--src/Text/Pandoc/Readers/AsciiDoc.hs416
2 files changed, 419 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index beec98587..5fa0ebeb7 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -25,6 +25,7 @@ module Text.Pandoc.Readers
-- * Readers: converting /to/ Pandoc format
Reader (..)
, readers
+ , readAsciiDoc
, readDocx
, readPptx
, readXlsx
@@ -84,6 +85,7 @@ import Text.Pandoc.Error
import Text.Pandoc.Extensions
import qualified Text.Pandoc.Format as Format
import Text.Pandoc.Options
+import Text.Pandoc.Readers.AsciiDoc
import Text.Pandoc.Readers.CommonMark
import Text.Pandoc.Readers.Markdown
import Text.Pandoc.Readers.Creole
@@ -142,6 +144,7 @@ readers = [("native" , TextReader readNative)
,("markdown_mmd", TextReader readMarkdown)
,("commonmark" , TextReader readCommonMark)
,("commonmark_x" , TextReader readCommonMark)
+ ,("asciidoc" , TextReader readAsciiDoc)
,("creole" , TextReader readCreole)
,("dokuwiki" , TextReader readDokuWiki)
,("gfm" , TextReader readCommonMark)
diff --git a/src/Text/Pandoc/Readers/AsciiDoc.hs b/src/Text/Pandoc/Readers/AsciiDoc.hs
new file mode 100644
index 000000000..44e5d8c47
--- /dev/null
+++ b/src/Text/Pandoc/Readers/AsciiDoc.hs
@@ -0,0 +1,416 @@
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE FlexibleContexts #-}
+
+{- |
+ Module : Text.Pandoc.Readers.AsciiDoc
+ Copyright : Copyright (C) 2024 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Reads and evaluates a AsciiDoc document as a Pandoc AST.
+-}
+module Text.Pandoc.Readers.AsciiDoc
+ ( readAsciiDoc
+ )
+where
+
+import Text.Pandoc.Class
+import Text.Pandoc.Options
+import Text.Pandoc.Definition
+import Text.Pandoc.Walk
+import Text.Pandoc.Shared (addPandocAttributes, blocksToInlines, safeRead,
+ tshow)
+import qualified Text.Pandoc.UTF8 as UTF8
+import qualified AsciiDoc as A
+import Text.Pandoc.Error
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Readers.HTML (readHtml)
+import Control.Monad.Except (throwError)
+import Control.Monad (mplus)
+import Text.Pandoc.Parsing (newPos, sourceName)
+import Text.Pandoc.Logging
+import Text.Pandoc.Sources
+import Control.Monad.State
+import Data.List (intersperse, foldl')
+import Data.Char (chr, ord)
+import qualified Data.Text as T
+import qualified Data.Map as M
+import Data.Maybe (fromMaybe)
+
+-- import Debug.Trace
+
+-- | Read AsciiDoc from an input string and return a Pandoc document.
+readAsciiDoc :: (PandocMonad m, ToSources a) => ReaderOptions -> a -> m Pandoc
+readAsciiDoc _opts inp = do
+ let Sources sources = toSources inp
+ (mconcat <$> mapM
+ (\(sourcepos, t) ->
+ A.parseDocument getIncludeFile raiseError (sourceName sourcepos) t)
+ sources)
+ >>= resolveFootnotes
+ >>= resolveStem
+ >>= resolveIcons
+ >>= toPandoc
+ where
+ getIncludeFile fp = UTF8.toText <$> readFileStrict fp
+ raiseError fp pos msg = throwError $ PandocParseError $ T.pack
+ $ msg <> " at " <> show fp <>
+ " char " <> show pos
+
+toPandoc :: PandocMonad m => A.Document -> m Pandoc
+toPandoc doc =
+ Pandoc <$> doMeta (A.docMeta doc)
+ <*> (B.toList <$> doBlocks (A.docBlocks doc))
+
+resolveFootnotes :: Monad m => A.Document -> m A.Document
+resolveFootnotes doc = do
+ evalStateT (A.mapInlines go doc) (mempty :: M.Map T.Text [A.Inline])
+ where
+ go (A.Inline attr (A.Footnote (Just (A.FootnoteId fnid)) ils)) = do
+ fnmap <- get
+ case M.lookup fnid fnmap of
+ Just ils' ->
+ pure $ A.Inline attr (A.Footnote (Just (A.FootnoteId fnid)) ils')
+ Nothing -> do
+ put $ M.insert fnid ils fnmap
+ pure $ A.Inline attr (A.Footnote (Just (A.FootnoteId fnid)) ils)
+ go x = pure x
+
+resolveStem :: Monad m => A.Document -> m A.Document
+resolveStem doc = do
+ let defaultType = case M.lookup "stem" (A.docAttributes (A.docMeta doc)) of
+ Just "asciimath" -> A.AsciiMath
+ _ -> A.LaTeXMath
+ let doInlineStem (A.Inline attr (A.Math Nothing t)) =
+ pure $ A.Inline attr (A.Math (Just defaultType) t)
+ doInlineStem x = pure x
+ let doBlockStem (A.Block attr mbtit (A.MathBlock Nothing t)) =
+ pure $ A.Block attr mbtit (A.MathBlock (Just defaultType) t)
+ doBlockStem x = A.mapInlines doInlineStem x
+ A.mapBlocks doBlockStem doc
+
+-- resolve icons as either characters in an icon font or images
+resolveIcons :: Monad m => A.Document -> m A.Document
+resolveIcons doc = A.mapInlines fromIcon doc
+ where
+ docattrs = A.docAttributes (A.docMeta doc)
+ iconFont = case M.lookup "icons" docattrs of
+ Just "font" -> True
+ _ -> False
+ iconsdir = fromMaybe "./images/icons" $ M.lookup "iconsdir" docattrs
+ icontype = fromMaybe "png" $ M.lookup "icontype" docattrs
+ addClasses cls (A.Attr ps kvs) =
+ A.Attr ps $
+ case M.lookup "role" kvs of
+ Just r -> M.insert "role" (T.unwords (r : cls)) kvs
+ Nothing -> M.insert "role" (T.unwords cls) kvs
+ fromIcon (A.Inline attr (A.Icon name)) =
+ if iconFont
+ then pure $
+ A.Inline (addClasses ["fa", "fa-" <> name] attr) (A.Span [])
+ else pure $ -- default is to use an image
+ A.Inline (addClasses ["icon"] attr)
+ (A.InlineImage
+ (A.Target
+ (iconsdir <> "/" <> name <> "." <> icontype))
+ Nothing Nothing Nothing)
+ fromIcon x = pure x
+
+addAttribution :: Maybe A.Attribution -> B.Blocks -> B.Blocks
+addAttribution Nothing bs = bs
+addAttribution (Just (A.Attribution t)) bs = B.fromList $
+ case B.toList bs of
+ [B.Div attr bls] -> [B.Div attr (bls ++ [attrBlock])]
+ [B.BlockQuote bls] -> [B.BlockQuote (bls ++ [attrBlock])]
+ xs -> xs ++ [attrBlock]
+ where
+ attrBlock = Para (B.toList $ B.text $ "\x2014 " <> t)
+
+doMeta :: PandocMonad m => A.Meta -> m B.Meta
+doMeta meta = do
+ tit' <- doInlines (A.docTitle meta)
+ pure $
+ (if tit' == mempty
+ then id
+ else B.setMeta "title" tit') .
+ (case A.docAuthors meta of
+ [] -> id
+ as -> B.setMeta "author" (map fromAuthor as)) .
+ (case A.docRevision meta of
+ Nothing -> id
+ Just (A.Revision vers mbdate mbremark) ->
+ B.setMeta "version" vers .
+ maybe id (B.setMeta "date") mbdate .
+ maybe id (B.setMeta "remark") mbremark) .
+ flip (foldl' (\m (k,v) ->
+ -- leave out flags that are set just for processing
+ if k == "sectids" || k == "stem"
+ then m
+ else if T.null v
+ then B.setMeta k True m
+ else B.setMeta k v m))
+ (M.toList (A.docAttributes meta))
+ $ mempty
+
+fromAuthor :: A.Author -> B.Inlines
+fromAuthor au = B.text (A.authorName au) <>
+ maybe mempty (\email ->
+ " (" <> B.link ("mailto:" <> email) "" (B.str email) <> ")")
+ (A.authorEmail au)
+
+doBlocks :: PandocMonad m => [A.Block] -> m B.Blocks
+doBlocks = fmap mconcat . mapM doBlock
+
+addBlockAttr :: A.Attr -> B.Blocks -> B.Blocks
+addBlockAttr (A.Attr _ kvs') bs =
+ case B.toList bs of
+ x@(B.OrderedList{}) : xs -> -- "start" is handled in list attribs
+ addPandocAttributes (M.toList $ M.delete "start" kvs)
+ (B.singleton x) <> B.fromList xs
+ x:xs -> addPandocAttributes (M.toList kvs) (B.singleton x)
+ <> B.fromList xs
+ [] -> mempty
+ where
+ kvs = M.mapKeys (\k -> if k == "role" then "class" else k) kvs'
+
+addBlockTitle :: B.Inlines -> B.Blocks -> B.Blocks
+addBlockTitle tit' bs =
+ let tit = B.toList tit'
+ in case B.toList bs of
+ [B.Table attr _ colspecs thead tbody tfoot] ->
+ B.singleton $ B.Table attr (B.Caption Nothing [B.Plain tit])
+ colspecs thead tbody tfoot
+ [B.Figure attr _ bs'] ->
+ B.singleton $ B.Figure attr (B.Caption Nothing [B.Plain tit]) bs'
+ [B.Div attr (B.Div ("",["title"],[]) [Para _] : bs')] ->
+ -- replace existing title, which might be e.g. "Note"
+ B.singleton $ B.Div attr (B.Div ("",["title"],[]) [B.Para tit] : bs')
+ [B.Div attr bs'] -> -- put title Div inside
+ B.singleton $ B.Div attr (B.Div ("",["title"],[]) [B.Para tit] : bs')
+ _ -> B.divWith B.nullAttr (B.divWith ("",["title"],[]) (B.para tit') <> bs)
+
+doBlock :: PandocMonad m => A.Block -> m B.Blocks
+doBlock (A.Block attr@(A.Attr ps kvs) mbtitle bt) = do
+ mbtitle' <- case mbtitle of
+ Nothing -> pure Nothing
+ Just (A.BlockTitle ils) -> Just <$> doInlines ils
+ addBlockAttr attr . maybe id addBlockTitle mbtitle' <$>
+ case bt of
+ A.Section (A.Level lev) ils bs -> do
+ ils' <- doInlines ils
+ bs' <- doBlocks bs
+ pure $ (B.header lev ils') <> bs'
+ A.DiscreteHeading (A.Level lev) ils ->
+ B.header lev <$> doInlines ils
+ A.Paragraph ils -> B.para <$> doInlines ils
+ A.LiteralBlock t -> pure $ B.codeBlock t
+ A.Listing mblang lns -> do
+ let fromCallout (A.Callout i)
+ | i <= 20 = T.pack [' ', chr (0x2460 + i - 1)]
+ | otherwise = "<" <> tshow i <> ">"
+ let fromSourceLine (A.SourceLine t callouts) =
+ t <> mconcat (map fromCallout callouts)
+ let code = T.intercalate "\n" $ map fromSourceLine lns
+ let classes = case mblang of
+ Nothing -> []
+ Just (A.Language l) -> [l]
+ pure $ B.codeBlockWith ("", classes, []) code
+ A.IncludeListing _ _ Nothing -> pure mempty
+ A.IncludeListing mblang _fp (Just lns) ->
+ doBlock (A.Block mempty mbtitle (A.Listing mblang lns))
+ A.ExampleBlock bs -> B.divWith ("",["example"],[]) <$> doBlocks bs
+ A.Sidebar bs -> B.divWith ("",["sidebar"],[]) <$> doBlocks bs
+ A.OpenBlock bs -> B.divWith ("",[],[]) <$> doBlocks bs
+ A.QuoteBlock mbattrib bs ->
+ addAttribution mbattrib . B.blockQuote <$> doBlocks bs
+ A.Verse mbattrib bs ->
+ addAttribution mbattrib . B.blockQuote <$> doBlocks bs
+ -- TODO when texmath's asciimath parser works, convert:
+ A.MathBlock (Just A.AsciiMath) t -> pure $ B.para $ B.displayMath t
+ A.MathBlock (Just A.LaTeXMath) t -> pure $ B.para $ B.displayMath t
+ A.MathBlock Nothing _ ->
+ throwError $ PandocParseError "Encountered math type Nothing"
+ A.List (A.BulletList _) items ->
+ B.bulletList <$> mapM doItem items
+ A.List A.CheckList items ->
+ B.bulletList <$> mapM doItem items
+ A.List (A.OrderedList _ mbstart) items -> do
+ let start = fromMaybe (1 :: Int)
+ (mbstart `mplus` (M.lookup "start" kvs >>= safeRead))
+ let getStyle xs = case xs of
+ "arabic":_ -> Decimal
+ "decimal":_ -> Decimal
+ "loweralpha":_ -> LowerAlpha
+ "upperalpha":_ -> UpperAlpha
+ "lowerroman":_ -> LowerRoman
+ "upperroman":_ -> UpperRoman
+ _:rest -> getStyle rest
+ [] -> DefaultStyle
+ let sty = getStyle ps
+ let delim = DefaultDelim
+ B.orderedListWith (start, sty, delim) <$> mapM doItem items
+ A.List A.CalloutList items ->
+ B.divWith ("",["callout-list"],[]) . B.orderedList <$> mapM doItem items
+ A.DefinitionList items
+ | "ordered" `elem` ps ->
+ B.orderedList <$>
+ mapM (fmap (B.definitionList . (:[])) . doDefListItem) items
+ | otherwise -> B.definitionList <$> mapM doDefListItem items
+ A.Table specs mbHeader rows mbFooter -> do
+ let toAlign A.AlignLeft = B.AlignLeft
+ toAlign A.AlignCenter = B.AlignCenter
+ toAlign A.AlignRight = B.AlignRight
+ let fromCell (A.TableCell bs mbHorizAlign _mbVertAlign colspan rowspan) =
+ B.Cell B.nullAttr (maybe B.AlignDefault toAlign mbHorizAlign)
+ (B.RowSpan rowspan) (B.ColSpan colspan) . B.toList
+ <$> doBlocks bs
+ let fromRow (A.TableRow cs) = B.Row B.nullAttr <$> mapM fromCell cs
+ tbody <- B.TableBody B.nullAttr (B.RowHeadColumns 0) [] <$> mapM fromRow rows
+ thead <- B.TableHead B.nullAttr <$> maybe (pure []) (mapM fromRow) mbHeader
+ tfoot <- B.TableFoot B.nullAttr <$> maybe (pure []) (mapM fromRow) mbFooter
+ let totalWidth = sum $ map (fromMaybe 1 . A.colWidth) specs
+ let toColSpec spec = (maybe B.AlignDefault toAlign (A.colHorizAlign spec),
+ maybe B.ColWidthDefault
+ (B.ColWidth . (\x ->
+ fromIntegral x / fromIntegral totalWidth))
+ (A.colWidth spec))
+ let colspecs = map toColSpec specs
+ pure $ B.table (B.Caption Nothing mempty) -- added by addBlockTitle
+ colspecs thead [tbody] tfoot
+ A.BlockImage target mbalt mbw mbh -> do
+ img' <- doInline (A.Inline mempty (A.InlineImage target mbalt mbw mbh))
+ -- TODO have a global function that adds the title to caption here:
+ pure $ B.figure (Caption Nothing mempty) -- added by addBlockTitle
+ (B.plain img')
+ -- TODO alt text?
+ A.BlockAudio (A.Target t) ->
+ pure $ B.plain $ B.image t "" (B.str t)
+ -- TODO alt text?
+ A.BlockVideo (A.Target t) ->
+ pure $ B.plain $ B.image t "" (B.str t)
+ A.TOC -> pure $ B.divWith ("toc",[],[]) mempty
+ A.Admonition admonitionType bs -> do
+ let admon = T.pack $ show admonitionType
+ bs' <- doBlocks bs
+ pure $ B.divWith ("",[T.toLower admon],[])
+ $ B.divWith ("",["title"],[]) (B.para (B.str admon)) <> bs'
+ A.PageBreak ->
+ pure $ B.divWith ("", ["page-break"], [("wrapper", "1")]) B.horizontalRule
+ A.ThematicBreak -> pure $ B.horizontalRule
+ A.Include fp (Just bs) ->
+ B.divWith ("",["included"],[("path",T.pack fp)]) <$> doBlocks bs
+ A.Include fp Nothing -> do
+ report $ CouldNotLoadIncludeFile (T.pack fp) (newPos "" 0 0)
+ pure mempty
+ A.PassthroughBlock t ->
+ case runPure (readHtml def{
+ readerExtensions = extensionsFromList [Ext_raw_html]
+ } t) of
+ Left _ -> pure $ B.rawBlock "html" t
+ Right (Pandoc _ bs) -> pure $ B.fromList bs
+
+doItem :: PandocMonad m => A.ListItem -> m B.Blocks
+doItem (A.ListItem Nothing bs) = doBlocks bs
+doItem (A.ListItem (Just checkstate) bs) = do
+ bs' <- doBlocks bs
+ let check = case checkstate of
+ A.Checked -> Str "\9746"
+ A.Unchecked -> Str "\9744"
+ pure $ B.fromList
+ $ case B.toList bs' of
+ (B.Para ils : rest) -> B.Para (check : B.Space : ils) : rest
+ (B.Plain ils : rest) -> B.Plain (check : B.Space : ils) : rest
+ rest -> B.Para [check] : rest
+
+doDefListItem :: PandocMonad m
+ => ([A.Inline], [A.Block]) -> m (B.Inlines , [B.Blocks])
+doDefListItem (lab, bs) = do
+ lab' <- doInlines lab
+ bs' <- doBlocks bs
+ pure (lab', [bs'])
+
+doInlines :: PandocMonad m => [A.Inline] -> m B.Inlines
+doInlines = fmap mconcat . mapM doInline
+
+doInline :: PandocMonad m => A.Inline -> m B.Inlines
+doInline (A.Inline (A.Attr _ps kvs') it) = do
+ let kvs = M.mapKeys (\k -> if k == "role" then "class" else k) kvs'
+ addPandocAttributes (M.toList kvs) <$>
+ case it of
+ A.Str t -> pure $ B.text t
+ A.HardBreak -> pure B.linebreak
+ A.Bold ils -> B.strong <$> doInlines ils
+ A.Italic ils -> B.emph <$> doInlines ils
+ A.Monospace ils -> walk monospaceStr <$> doInlines ils
+ A.Superscript ils -> B.superscript <$> doInlines ils
+ A.Subscript ils -> B.subscript <$> doInlines ils
+ A.Highlight ils -> B.spanWith ("",["mark"],[]) <$> doInlines ils
+ A.Strikethrough ils -> B.strikeout <$> doInlines ils
+ A.DoubleQuoted ils -> B.doubleQuoted <$> doInlines ils
+ A.SingleQuoted ils -> B.singleQuoted <$> doInlines ils
+ -- TODO when texmath's asciimath parser works, convert:
+ A.Math (Just A.AsciiMath) t -> pure $ B.math t
+ A.Math (Just A.LaTeXMath) t -> pure $ B.math t
+ A.Math Nothing _ ->
+ throwError $ PandocParseError "Encountered math type Nothing"
+ A.Icon t -> pure $ B.spanWith ("",["icon"],[("name",t)])
+ (B.str ("[" <> t <> "]"))
+ A.Button t -> pure $ B.spanWith ("",["button"],[])
+ (B.strong $ B.str ("[" <> t <> "]"))
+ A.Kbd ts -> pure $ mconcat $ intersperse (B.str "+") $
+ map (B.spanWith ("",["kbd"],[]) . B.strong . B.str) ts
+ A.Menu ts -> pure $ B.spanWith ("",["menu"],[]) $
+ B.strong $ B.text $ T.intercalate " › " ts
+ -- TODO do we need linktype?
+ A.Link _linkType (A.Target t) ils -> B.link t "" <$> doInlines ils
+ A.InlineImage (A.Target url) mbalt mbwidth mbheight -> do
+ let alt = case mbalt of
+ Just (A.AltText t) -> B.text t
+ Nothing -> mempty
+ width = case mbwidth of
+ Just (A.Width n) -> [("width", T.pack $ show n <> "px")]
+ Nothing -> []
+ height = case mbheight of
+ Just (A.Height n) -> [("height", T.pack $ show n <> "px")]
+ Nothing -> []
+ pure $ B.imageWith ("",[], width ++ height) url "" alt
+ A.Footnote _ ils -> B.note . B.para <$> doInlines ils
+ A.InlineAnchor t _ -> pure $ B.spanWith (t, [], []) mempty
+ A.BibliographyAnchor t _ -> pure $ B.spanWith (t, [], []) mempty
+ A.CrossReference t Nothing ->
+ pure $ B.linkWith ("",["cross-reference"],[]) ("#" <> t) "" (B.str t)
+ A.CrossReference t (Just ils) -> do
+ B.linkWith ("",["cross-reference"],[]) ("#" <> t) "" <$> doInlines ils
+ A.AttributeReference (A.AttributeName t) -> -- if this is here, it's unresolved
+ pure $ B.str ("{" <> t <> "}")
+ A.Span ils -> B.spanWith B.nullAttr <$> doInlines ils
+ A.IndexEntry (A.TermInText t) ->
+ pure $ B.spanWith ("",["index"],[("term",t)]) (B.text t)
+ A.IndexEntry (A.TermConcealed ts) ->
+ pure $ B.spanWith ("",["index"],[("term",T.intercalate "," ts)]) mempty
+ A.Counter name ctype val ->
+ pure $ B.spanWith ("",["counter"],[("name",name)]) $ B.str $
+ case ctype of
+ A.DecimalCounter -> tshow val
+ A.UpperAlphaCounter -> T.singleton $ chr (ord 'A' + val - 1)
+ A.LowerAlphaCounter -> T.singleton $ chr (ord 'a' + val - 1)
+ -- Passthrough is hard to get right, because pandoc's RawInline needs
+ -- a format specifier. Often in asciidoc passthrough is used as a form
+ -- of escaping, so the best approach seems to be treating it as HTML
+ -- and parsing it:
+ A.Passthrough t -> do
+ case runPure (readHtml def{
+ readerExtensions = extensionsFromList [Ext_raw_html]
+ } t) of
+ Left _ -> pure $ B.rawInline "html" t
+ Right (Pandoc _ bs) -> pure $ B.fromList . blocksToInlines $ bs
+
+monospaceStr :: Inline -> Inline
+monospaceStr (Str t) = Code B.nullAttr t
+monospaceStr x = x