diff options
| author | John MacFarlane <[email protected]> | 2025-11-23 08:45:18 +0100 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2025-11-29 17:04:42 +0100 |
| commit | 8651ec3c28693c875d78017b4f278338417d176a (patch) | |
| tree | 24065899ab03f7daaa628a04a8060f4b54c2e505 /src | |
| parent | 525113c31ea07d35a5a1e436866c7ed51fc90ccf (diff) | |
Add asciidoc as an input format.
New exported module Text.Pandoc.Readers.AsciiDoc,
exporting readAsciiDoc [API change].
The bulk of parsing is handled by the asciidoc library.
Closes #1456.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/Readers.hs | 3 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/AsciiDoc.hs | 416 |
2 files changed, 419 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index beec98587..5fa0ebeb7 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -25,6 +25,7 @@ module Text.Pandoc.Readers -- * Readers: converting /to/ Pandoc format Reader (..) , readers + , readAsciiDoc , readDocx , readPptx , readXlsx @@ -84,6 +85,7 @@ import Text.Pandoc.Error import Text.Pandoc.Extensions import qualified Text.Pandoc.Format as Format import Text.Pandoc.Options +import Text.Pandoc.Readers.AsciiDoc import Text.Pandoc.Readers.CommonMark import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.Creole @@ -142,6 +144,7 @@ readers = [("native" , TextReader readNative) ,("markdown_mmd", TextReader readMarkdown) ,("commonmark" , TextReader readCommonMark) ,("commonmark_x" , TextReader readCommonMark) + ,("asciidoc" , TextReader readAsciiDoc) ,("creole" , TextReader readCreole) ,("dokuwiki" , TextReader readDokuWiki) ,("gfm" , TextReader readCommonMark) diff --git a/src/Text/Pandoc/Readers/AsciiDoc.hs b/src/Text/Pandoc/Readers/AsciiDoc.hs new file mode 100644 index 000000000..44e5d8c47 --- /dev/null +++ b/src/Text/Pandoc/Readers/AsciiDoc.hs @@ -0,0 +1,416 @@ +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} + +{- | + Module : Text.Pandoc.Readers.AsciiDoc + Copyright : Copyright (C) 2024 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <[email protected]> + Stability : alpha + Portability : portable + +Reads and evaluates a AsciiDoc document as a Pandoc AST. +-} +module Text.Pandoc.Readers.AsciiDoc + ( readAsciiDoc + ) +where + +import Text.Pandoc.Class +import Text.Pandoc.Options +import Text.Pandoc.Definition +import Text.Pandoc.Walk +import Text.Pandoc.Shared (addPandocAttributes, blocksToInlines, safeRead, + tshow) +import qualified Text.Pandoc.UTF8 as UTF8 +import qualified AsciiDoc as A +import Text.Pandoc.Error +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Readers.HTML (readHtml) +import Control.Monad.Except (throwError) +import Control.Monad (mplus) +import Text.Pandoc.Parsing (newPos, sourceName) +import Text.Pandoc.Logging +import Text.Pandoc.Sources +import Control.Monad.State +import Data.List (intersperse, foldl') +import Data.Char (chr, ord) +import qualified Data.Text as T +import qualified Data.Map as M +import Data.Maybe (fromMaybe) + +-- import Debug.Trace + +-- | Read AsciiDoc from an input string and return a Pandoc document. +readAsciiDoc :: (PandocMonad m, ToSources a) => ReaderOptions -> a -> m Pandoc +readAsciiDoc _opts inp = do + let Sources sources = toSources inp + (mconcat <$> mapM + (\(sourcepos, t) -> + A.parseDocument getIncludeFile raiseError (sourceName sourcepos) t) + sources) + >>= resolveFootnotes + >>= resolveStem + >>= resolveIcons + >>= toPandoc + where + getIncludeFile fp = UTF8.toText <$> readFileStrict fp + raiseError fp pos msg = throwError $ PandocParseError $ T.pack + $ msg <> " at " <> show fp <> + " char " <> show pos + +toPandoc :: PandocMonad m => A.Document -> m Pandoc +toPandoc doc = + Pandoc <$> doMeta (A.docMeta doc) + <*> (B.toList <$> doBlocks (A.docBlocks doc)) + +resolveFootnotes :: Monad m => A.Document -> m A.Document +resolveFootnotes doc = do + evalStateT (A.mapInlines go doc) (mempty :: M.Map T.Text [A.Inline]) + where + go (A.Inline attr (A.Footnote (Just (A.FootnoteId fnid)) ils)) = do + fnmap <- get + case M.lookup fnid fnmap of + Just ils' -> + pure $ A.Inline attr (A.Footnote (Just (A.FootnoteId fnid)) ils') + Nothing -> do + put $ M.insert fnid ils fnmap + pure $ A.Inline attr (A.Footnote (Just (A.FootnoteId fnid)) ils) + go x = pure x + +resolveStem :: Monad m => A.Document -> m A.Document +resolveStem doc = do + let defaultType = case M.lookup "stem" (A.docAttributes (A.docMeta doc)) of + Just "asciimath" -> A.AsciiMath + _ -> A.LaTeXMath + let doInlineStem (A.Inline attr (A.Math Nothing t)) = + pure $ A.Inline attr (A.Math (Just defaultType) t) + doInlineStem x = pure x + let doBlockStem (A.Block attr mbtit (A.MathBlock Nothing t)) = + pure $ A.Block attr mbtit (A.MathBlock (Just defaultType) t) + doBlockStem x = A.mapInlines doInlineStem x + A.mapBlocks doBlockStem doc + +-- resolve icons as either characters in an icon font or images +resolveIcons :: Monad m => A.Document -> m A.Document +resolveIcons doc = A.mapInlines fromIcon doc + where + docattrs = A.docAttributes (A.docMeta doc) + iconFont = case M.lookup "icons" docattrs of + Just "font" -> True + _ -> False + iconsdir = fromMaybe "./images/icons" $ M.lookup "iconsdir" docattrs + icontype = fromMaybe "png" $ M.lookup "icontype" docattrs + addClasses cls (A.Attr ps kvs) = + A.Attr ps $ + case M.lookup "role" kvs of + Just r -> M.insert "role" (T.unwords (r : cls)) kvs + Nothing -> M.insert "role" (T.unwords cls) kvs + fromIcon (A.Inline attr (A.Icon name)) = + if iconFont + then pure $ + A.Inline (addClasses ["fa", "fa-" <> name] attr) (A.Span []) + else pure $ -- default is to use an image + A.Inline (addClasses ["icon"] attr) + (A.InlineImage + (A.Target + (iconsdir <> "/" <> name <> "." <> icontype)) + Nothing Nothing Nothing) + fromIcon x = pure x + +addAttribution :: Maybe A.Attribution -> B.Blocks -> B.Blocks +addAttribution Nothing bs = bs +addAttribution (Just (A.Attribution t)) bs = B.fromList $ + case B.toList bs of + [B.Div attr bls] -> [B.Div attr (bls ++ [attrBlock])] + [B.BlockQuote bls] -> [B.BlockQuote (bls ++ [attrBlock])] + xs -> xs ++ [attrBlock] + where + attrBlock = Para (B.toList $ B.text $ "\x2014 " <> t) + +doMeta :: PandocMonad m => A.Meta -> m B.Meta +doMeta meta = do + tit' <- doInlines (A.docTitle meta) + pure $ + (if tit' == mempty + then id + else B.setMeta "title" tit') . + (case A.docAuthors meta of + [] -> id + as -> B.setMeta "author" (map fromAuthor as)) . + (case A.docRevision meta of + Nothing -> id + Just (A.Revision vers mbdate mbremark) -> + B.setMeta "version" vers . + maybe id (B.setMeta "date") mbdate . + maybe id (B.setMeta "remark") mbremark) . + flip (foldl' (\m (k,v) -> + -- leave out flags that are set just for processing + if k == "sectids" || k == "stem" + then m + else if T.null v + then B.setMeta k True m + else B.setMeta k v m)) + (M.toList (A.docAttributes meta)) + $ mempty + +fromAuthor :: A.Author -> B.Inlines +fromAuthor au = B.text (A.authorName au) <> + maybe mempty (\email -> + " (" <> B.link ("mailto:" <> email) "" (B.str email) <> ")") + (A.authorEmail au) + +doBlocks :: PandocMonad m => [A.Block] -> m B.Blocks +doBlocks = fmap mconcat . mapM doBlock + +addBlockAttr :: A.Attr -> B.Blocks -> B.Blocks +addBlockAttr (A.Attr _ kvs') bs = + case B.toList bs of + x@(B.OrderedList{}) : xs -> -- "start" is handled in list attribs + addPandocAttributes (M.toList $ M.delete "start" kvs) + (B.singleton x) <> B.fromList xs + x:xs -> addPandocAttributes (M.toList kvs) (B.singleton x) + <> B.fromList xs + [] -> mempty + where + kvs = M.mapKeys (\k -> if k == "role" then "class" else k) kvs' + +addBlockTitle :: B.Inlines -> B.Blocks -> B.Blocks +addBlockTitle tit' bs = + let tit = B.toList tit' + in case B.toList bs of + [B.Table attr _ colspecs thead tbody tfoot] -> + B.singleton $ B.Table attr (B.Caption Nothing [B.Plain tit]) + colspecs thead tbody tfoot + [B.Figure attr _ bs'] -> + B.singleton $ B.Figure attr (B.Caption Nothing [B.Plain tit]) bs' + [B.Div attr (B.Div ("",["title"],[]) [Para _] : bs')] -> + -- replace existing title, which might be e.g. "Note" + B.singleton $ B.Div attr (B.Div ("",["title"],[]) [B.Para tit] : bs') + [B.Div attr bs'] -> -- put title Div inside + B.singleton $ B.Div attr (B.Div ("",["title"],[]) [B.Para tit] : bs') + _ -> B.divWith B.nullAttr (B.divWith ("",["title"],[]) (B.para tit') <> bs) + +doBlock :: PandocMonad m => A.Block -> m B.Blocks +doBlock (A.Block attr@(A.Attr ps kvs) mbtitle bt) = do + mbtitle' <- case mbtitle of + Nothing -> pure Nothing + Just (A.BlockTitle ils) -> Just <$> doInlines ils + addBlockAttr attr . maybe id addBlockTitle mbtitle' <$> + case bt of + A.Section (A.Level lev) ils bs -> do + ils' <- doInlines ils + bs' <- doBlocks bs + pure $ (B.header lev ils') <> bs' + A.DiscreteHeading (A.Level lev) ils -> + B.header lev <$> doInlines ils + A.Paragraph ils -> B.para <$> doInlines ils + A.LiteralBlock t -> pure $ B.codeBlock t + A.Listing mblang lns -> do + let fromCallout (A.Callout i) + | i <= 20 = T.pack [' ', chr (0x2460 + i - 1)] + | otherwise = "<" <> tshow i <> ">" + let fromSourceLine (A.SourceLine t callouts) = + t <> mconcat (map fromCallout callouts) + let code = T.intercalate "\n" $ map fromSourceLine lns + let classes = case mblang of + Nothing -> [] + Just (A.Language l) -> [l] + pure $ B.codeBlockWith ("", classes, []) code + A.IncludeListing _ _ Nothing -> pure mempty + A.IncludeListing mblang _fp (Just lns) -> + doBlock (A.Block mempty mbtitle (A.Listing mblang lns)) + A.ExampleBlock bs -> B.divWith ("",["example"],[]) <$> doBlocks bs + A.Sidebar bs -> B.divWith ("",["sidebar"],[]) <$> doBlocks bs + A.OpenBlock bs -> B.divWith ("",[],[]) <$> doBlocks bs + A.QuoteBlock mbattrib bs -> + addAttribution mbattrib . B.blockQuote <$> doBlocks bs + A.Verse mbattrib bs -> + addAttribution mbattrib . B.blockQuote <$> doBlocks bs + -- TODO when texmath's asciimath parser works, convert: + A.MathBlock (Just A.AsciiMath) t -> pure $ B.para $ B.displayMath t + A.MathBlock (Just A.LaTeXMath) t -> pure $ B.para $ B.displayMath t + A.MathBlock Nothing _ -> + throwError $ PandocParseError "Encountered math type Nothing" + A.List (A.BulletList _) items -> + B.bulletList <$> mapM doItem items + A.List A.CheckList items -> + B.bulletList <$> mapM doItem items + A.List (A.OrderedList _ mbstart) items -> do + let start = fromMaybe (1 :: Int) + (mbstart `mplus` (M.lookup "start" kvs >>= safeRead)) + let getStyle xs = case xs of + "arabic":_ -> Decimal + "decimal":_ -> Decimal + "loweralpha":_ -> LowerAlpha + "upperalpha":_ -> UpperAlpha + "lowerroman":_ -> LowerRoman + "upperroman":_ -> UpperRoman + _:rest -> getStyle rest + [] -> DefaultStyle + let sty = getStyle ps + let delim = DefaultDelim + B.orderedListWith (start, sty, delim) <$> mapM doItem items + A.List A.CalloutList items -> + B.divWith ("",["callout-list"],[]) . B.orderedList <$> mapM doItem items + A.DefinitionList items + | "ordered" `elem` ps -> + B.orderedList <$> + mapM (fmap (B.definitionList . (:[])) . doDefListItem) items + | otherwise -> B.definitionList <$> mapM doDefListItem items + A.Table specs mbHeader rows mbFooter -> do + let toAlign A.AlignLeft = B.AlignLeft + toAlign A.AlignCenter = B.AlignCenter + toAlign A.AlignRight = B.AlignRight + let fromCell (A.TableCell bs mbHorizAlign _mbVertAlign colspan rowspan) = + B.Cell B.nullAttr (maybe B.AlignDefault toAlign mbHorizAlign) + (B.RowSpan rowspan) (B.ColSpan colspan) . B.toList + <$> doBlocks bs + let fromRow (A.TableRow cs) = B.Row B.nullAttr <$> mapM fromCell cs + tbody <- B.TableBody B.nullAttr (B.RowHeadColumns 0) [] <$> mapM fromRow rows + thead <- B.TableHead B.nullAttr <$> maybe (pure []) (mapM fromRow) mbHeader + tfoot <- B.TableFoot B.nullAttr <$> maybe (pure []) (mapM fromRow) mbFooter + let totalWidth = sum $ map (fromMaybe 1 . A.colWidth) specs + let toColSpec spec = (maybe B.AlignDefault toAlign (A.colHorizAlign spec), + maybe B.ColWidthDefault + (B.ColWidth . (\x -> + fromIntegral x / fromIntegral totalWidth)) + (A.colWidth spec)) + let colspecs = map toColSpec specs + pure $ B.table (B.Caption Nothing mempty) -- added by addBlockTitle + colspecs thead [tbody] tfoot + A.BlockImage target mbalt mbw mbh -> do + img' <- doInline (A.Inline mempty (A.InlineImage target mbalt mbw mbh)) + -- TODO have a global function that adds the title to caption here: + pure $ B.figure (Caption Nothing mempty) -- added by addBlockTitle + (B.plain img') + -- TODO alt text? + A.BlockAudio (A.Target t) -> + pure $ B.plain $ B.image t "" (B.str t) + -- TODO alt text? + A.BlockVideo (A.Target t) -> + pure $ B.plain $ B.image t "" (B.str t) + A.TOC -> pure $ B.divWith ("toc",[],[]) mempty + A.Admonition admonitionType bs -> do + let admon = T.pack $ show admonitionType + bs' <- doBlocks bs + pure $ B.divWith ("",[T.toLower admon],[]) + $ B.divWith ("",["title"],[]) (B.para (B.str admon)) <> bs' + A.PageBreak -> + pure $ B.divWith ("", ["page-break"], [("wrapper", "1")]) B.horizontalRule + A.ThematicBreak -> pure $ B.horizontalRule + A.Include fp (Just bs) -> + B.divWith ("",["included"],[("path",T.pack fp)]) <$> doBlocks bs + A.Include fp Nothing -> do + report $ CouldNotLoadIncludeFile (T.pack fp) (newPos "" 0 0) + pure mempty + A.PassthroughBlock t -> + case runPure (readHtml def{ + readerExtensions = extensionsFromList [Ext_raw_html] + } t) of + Left _ -> pure $ B.rawBlock "html" t + Right (Pandoc _ bs) -> pure $ B.fromList bs + +doItem :: PandocMonad m => A.ListItem -> m B.Blocks +doItem (A.ListItem Nothing bs) = doBlocks bs +doItem (A.ListItem (Just checkstate) bs) = do + bs' <- doBlocks bs + let check = case checkstate of + A.Checked -> Str "\9746" + A.Unchecked -> Str "\9744" + pure $ B.fromList + $ case B.toList bs' of + (B.Para ils : rest) -> B.Para (check : B.Space : ils) : rest + (B.Plain ils : rest) -> B.Plain (check : B.Space : ils) : rest + rest -> B.Para [check] : rest + +doDefListItem :: PandocMonad m + => ([A.Inline], [A.Block]) -> m (B.Inlines , [B.Blocks]) +doDefListItem (lab, bs) = do + lab' <- doInlines lab + bs' <- doBlocks bs + pure (lab', [bs']) + +doInlines :: PandocMonad m => [A.Inline] -> m B.Inlines +doInlines = fmap mconcat . mapM doInline + +doInline :: PandocMonad m => A.Inline -> m B.Inlines +doInline (A.Inline (A.Attr _ps kvs') it) = do + let kvs = M.mapKeys (\k -> if k == "role" then "class" else k) kvs' + addPandocAttributes (M.toList kvs) <$> + case it of + A.Str t -> pure $ B.text t + A.HardBreak -> pure B.linebreak + A.Bold ils -> B.strong <$> doInlines ils + A.Italic ils -> B.emph <$> doInlines ils + A.Monospace ils -> walk monospaceStr <$> doInlines ils + A.Superscript ils -> B.superscript <$> doInlines ils + A.Subscript ils -> B.subscript <$> doInlines ils + A.Highlight ils -> B.spanWith ("",["mark"],[]) <$> doInlines ils + A.Strikethrough ils -> B.strikeout <$> doInlines ils + A.DoubleQuoted ils -> B.doubleQuoted <$> doInlines ils + A.SingleQuoted ils -> B.singleQuoted <$> doInlines ils + -- TODO when texmath's asciimath parser works, convert: + A.Math (Just A.AsciiMath) t -> pure $ B.math t + A.Math (Just A.LaTeXMath) t -> pure $ B.math t + A.Math Nothing _ -> + throwError $ PandocParseError "Encountered math type Nothing" + A.Icon t -> pure $ B.spanWith ("",["icon"],[("name",t)]) + (B.str ("[" <> t <> "]")) + A.Button t -> pure $ B.spanWith ("",["button"],[]) + (B.strong $ B.str ("[" <> t <> "]")) + A.Kbd ts -> pure $ mconcat $ intersperse (B.str "+") $ + map (B.spanWith ("",["kbd"],[]) . B.strong . B.str) ts + A.Menu ts -> pure $ B.spanWith ("",["menu"],[]) $ + B.strong $ B.text $ T.intercalate " › " ts + -- TODO do we need linktype? + A.Link _linkType (A.Target t) ils -> B.link t "" <$> doInlines ils + A.InlineImage (A.Target url) mbalt mbwidth mbheight -> do + let alt = case mbalt of + Just (A.AltText t) -> B.text t + Nothing -> mempty + width = case mbwidth of + Just (A.Width n) -> [("width", T.pack $ show n <> "px")] + Nothing -> [] + height = case mbheight of + Just (A.Height n) -> [("height", T.pack $ show n <> "px")] + Nothing -> [] + pure $ B.imageWith ("",[], width ++ height) url "" alt + A.Footnote _ ils -> B.note . B.para <$> doInlines ils + A.InlineAnchor t _ -> pure $ B.spanWith (t, [], []) mempty + A.BibliographyAnchor t _ -> pure $ B.spanWith (t, [], []) mempty + A.CrossReference t Nothing -> + pure $ B.linkWith ("",["cross-reference"],[]) ("#" <> t) "" (B.str t) + A.CrossReference t (Just ils) -> do + B.linkWith ("",["cross-reference"],[]) ("#" <> t) "" <$> doInlines ils + A.AttributeReference (A.AttributeName t) -> -- if this is here, it's unresolved + pure $ B.str ("{" <> t <> "}") + A.Span ils -> B.spanWith B.nullAttr <$> doInlines ils + A.IndexEntry (A.TermInText t) -> + pure $ B.spanWith ("",["index"],[("term",t)]) (B.text t) + A.IndexEntry (A.TermConcealed ts) -> + pure $ B.spanWith ("",["index"],[("term",T.intercalate "," ts)]) mempty + A.Counter name ctype val -> + pure $ B.spanWith ("",["counter"],[("name",name)]) $ B.str $ + case ctype of + A.DecimalCounter -> tshow val + A.UpperAlphaCounter -> T.singleton $ chr (ord 'A' + val - 1) + A.LowerAlphaCounter -> T.singleton $ chr (ord 'a' + val - 1) + -- Passthrough is hard to get right, because pandoc's RawInline needs + -- a format specifier. Often in asciidoc passthrough is used as a form + -- of escaping, so the best approach seems to be treating it as HTML + -- and parsing it: + A.Passthrough t -> do + case runPure (readHtml def{ + readerExtensions = extensionsFromList [Ext_raw_html] + } t) of + Left _ -> pure $ B.rawInline "html" t + Right (Pandoc _ bs) -> pure $ B.fromList . blocksToInlines $ bs + +monospaceStr :: Inline -> Inline +monospaceStr (Str t) = Code B.nullAttr t +monospaceStr x = x |
