diff options
| author | massifrg <[email protected]> | 2025-06-16 01:14:36 +0200 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2025-07-26 22:45:11 -0700 |
| commit | fe3684632b543b3d8a6b23de9da42dd87daedde7 (patch) | |
| tree | 279ea56f1c2d06c8e9db2e1b144013ac0baa8ec4 /src | |
| parent | cf11339c1d3add1c4d758d0107d714b5954584fe (diff) | |
New `xml` format exactly representing a Pandoc AST.
This adds a reader and writer for an XML format equivalent to `native`
and `json`.
XML schemas for validation can be found in `tools/pandoc-xml.*`.
The format is documented in `doc/xml.md`.
API changes:
- Add module Text.Pandoc.Readers.XML, exporting `readXML`.
- Add module Text.Pandoc.Writers.XML, exporting `writeXML`.
A new unexported module Text.Pandoc.XMLFormat is also added.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/Readers.hs | 3 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/XML.hs | 540 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers.hs | 3 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/XML.hs | 365 | ||||
| -rw-r--r-- | src/Text/Pandoc/XMLFormat.hs | 188 |
5 files changed, 1099 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 8f4800efb..12d1c6c95 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -65,6 +65,7 @@ module Text.Pandoc.Readers , readTypst , readDjot , readPod + , readXML -- * Miscellaneous , getReader , getDefaultExtensions @@ -118,6 +119,7 @@ import Text.Pandoc.Readers.RIS import Text.Pandoc.Readers.RTF import Text.Pandoc.Readers.Typst import Text.Pandoc.Readers.Djot +import Text.Pandoc.Readers.XML import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Sources (ToSources(..), sourcesToText) @@ -174,6 +176,7 @@ readers = [("native" , TextReader readNative) ,("djot" , TextReader readDjot) ,("mdoc" , TextReader readMdoc) ,("pod" , TextReader readPod) + ,("xml" , TextReader readXML) ] -- | Retrieve reader, extensions based on format spec (format+extensions). diff --git a/src/Text/Pandoc/Readers/XML.hs b/src/Text/Pandoc/Readers/XML.hs new file mode 100644 index 000000000..e3173f4c9 --- /dev/null +++ b/src/Text/Pandoc/Readers/XML.hs @@ -0,0 +1,540 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- | +-- Module : Text.Pandoc.Readers.XML +-- Copyright : Copyright (C) 2025- Massimiliano Farinella and John MacFarlane +-- License : GNU GPL, version 2 or above +-- +-- Maintainer : Massimiliano Farinella <[email protected]> +-- Stability : WIP +-- Portability : portable +-- +-- Conversion of (Pandoc specific) xml to 'Pandoc' document. +module Text.Pandoc.Readers.XML (readXML) where + +import Control.Monad (msum) +import Control.Monad.Except (throwError) +import Control.Monad.State.Strict (StateT (runStateT), modify) +import Data.Char (isSpace) +import Data.Default (Default (..)) +import qualified Data.List as L +import qualified Data.Map as M +import Data.Maybe (catMaybes, fromMaybe, mapMaybe) +import qualified Data.Set as S (Set, fromList, member) +import Data.Text (Text) +import qualified Data.Text as T +import Data.Text.Lazy (fromStrict) +import Data.Version (Version, makeVersion) +import Text.Pandoc.Builder +import Text.Pandoc.Class.PandocMonad +import Text.Pandoc.Error (PandocError (..)) +import Text.Pandoc.Options +import Text.Pandoc.Parsing (ToSources, toSources) +import Text.Pandoc.Sources (sourcesToText) +import Text.Pandoc.Version (pandocVersion) +import Text.Pandoc.XML (lookupEntity) +import Text.Pandoc.XML.Light +import Text.Pandoc.XMLFormat +import Text.Read (readMaybe) + +-- TODO: use xmlPath state to give better context when an error occurs + +type XMLReader m = StateT XMLReaderState m + +data XMLReaderState = XMLReaderState + { xmlApiVersion :: Version, + xmlMeta :: Meta, + xmlContent :: [Content], + xmlPath :: [Text] + } + deriving (Show) + +instance Default XMLReaderState where + def = + XMLReaderState + { xmlApiVersion = pandocVersion, + xmlMeta = mempty, + xmlContent = [], + xmlPath = ["root"] + } + +readXML :: (PandocMonad m, ToSources a) => ReaderOptions -> a -> m Pandoc +readXML _ inp = do + let sources = toSources inp + tree <- + either (throwError . PandocXMLError "") return $ + parseXMLContents (fromStrict . sourcesToText $ sources) + (bs, st') <- flip runStateT (def {xmlContent = tree}) $ mapM parseBlock tree + let blockList = toList $ concatMany bs + return $ Pandoc (xmlMeta st') blockList + +concatMany :: [Many a] -> Many a +concatMany = Many . mconcat . map unMany + +parseBlocks :: (PandocMonad m) => [Content] -> XMLReader m Blocks +parseBlocks contents = concatMany <$> mapM parseBlock contents + +getBlocks :: (PandocMonad m) => Element -> XMLReader m Blocks +getBlocks e = parseBlocks (elContent e) + +elementName :: Element -> Text +elementName e = qName $ elName e + +attrValue :: Text -> Element -> Text +attrValue attr = + fromMaybe "" . maybeAttrValue attr + +maybeAttrValue :: Text -> Element -> Maybe Text +maybeAttrValue attr elt = + lookupAttrBy (\x -> qName x == attr) (elAttribs elt) + +parseBlock :: (PandocMonad m) => Content -> XMLReader m Blocks +parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE +parseBlock (Text (CData _ s _)) = + if T.all isSpace s + then return mempty + else do + throwError $ PandocXMLError "" "non-space characters out of inline context" +parseBlock (CRef x) = do + throwError $ PandocXMLError "" ("reference \"" <> x <> "\" out of inline context") +parseBlock (Elem e) = do + let name = elementName e + in case (name) of + "Pandoc" -> parsePandoc + "?xml" -> return mempty + "blocks" -> getBlocks e + "meta" -> + let entry_els = childrenNamed tgNameMetaMapEntry e + in do + entries <- catMaybes <$> mapM parseMetaMapEntry entry_els + mapM_ (uncurry addMeta) entries + return mempty + "Para" -> para <$> getInlines (elContent e) + "Plain" -> do + ils <- getInlines (elContent e) + return $ singleton . Plain . toList $ ils + "Header" -> (headerWith attr level) <$> getInlines (elContent e) + where + level = textToInt (attrValue atNameLevel e) 1 + attr = filterAttrAttributes [atNameLevel] $ attrFromElement e + "HorizontalRule" -> return horizontalRule + "BlockQuote" -> do + contents <- getBlocks e + return $ blockQuote contents + "Div" -> do + contents <- getBlocks e + return $ divWith (attrFromElement e) contents + "BulletList" -> do + items <- getListItems e + return $ bulletList items + "OrderedList" -> do + items <- getListItems e + return $ orderedListWith (getListAttributes e) items + "DefinitionList" -> do + let items_contents = getContentsOfElements (isElementNamed tgNameDefListItem) (elContent e) + items <- mapM parseDefinitionListItem items_contents + return $ definitionList items + "Figure" -> do + let attr = attrFromElement e + (maybe_caption_el, contents) = partitionFirstChildNamed "Caption" $ elContent e + figure_caption <- case (maybe_caption_el) of + Just (caption_el) -> parseCaption $ elContent caption_el + Nothing -> pure emptyCaption + blocks <- parseBlocks contents + return $ figureWith attr figure_caption blocks + "CodeBlock" -> do + let attr = attrFromElement e + return $ codeBlockWith attr $ strContentRecursive e + "RawBlock" -> do + let format = (attrValue atNameFormat e) + return $ rawBlock format $ strContentRecursive e + "LineBlock" -> do + lins <- mapM getInlines (contentsOfChildren tgNameLineItem (elContent e)) + return $ lineBlock lins + "Table" -> do + -- TODO: check unexpected items + let attr = attrFromElement e + (maybe_caption_el, after_caption) = partitionFirstChildNamed "Caption" $ elContent e + children = elementsWithNames (S.fromList [tgNameColspecs, "TableHead", "TableBody", "TableFoot"]) after_caption + is_element tag el = tag == elementName el + colspecs <- getColspecs $ L.find (is_element tgNameColspecs) children + tbs <- getTableBodies $ filter (is_element "TableBody") children + th <- getTableHead $ L.find (is_element "TableHead") children + tf <- getTableFoot $ L.find (is_element "TableFoot") children + capt <- parseMaybeCaptionElement maybe_caption_el + case colspecs of + Nothing -> return mempty + Just cs -> return $ fromList [Table attr capt cs th tbs tf] + _ -> do + throwError $ PandocXMLError "" ("unexpected element \"" <> name <> "\" in blocks context") + where + parsePandoc = do + let version = maybeAttrValue atNameApiVersion e + apiversion = case (version) of + Just (v) -> makeVersion $ map (read . T.unpack) $ T.splitOn "," v + Nothing -> pandocVersion + in modify $ \st -> st {xmlApiVersion = apiversion} + getBlocks e + +getListItems :: (PandocMonad m) => Element -> XMLReader m [Blocks] +getListItems e = + let items_els = childrenNamed tgNameListItem e + in do + mapM getBlocks items_els + +getContentsOfElements :: (Content -> Bool) -> [Content] -> [[Content]] +getContentsOfElements filter_element contents = mapMaybe element_contents $ filter filter_element contents + where + element_contents :: Content -> Maybe [Content] + element_contents c = case (c) of + Elem e -> Just (elContent e) + _ -> Nothing + +strContentRecursive :: Element -> Text +strContentRecursive = + strContent + . (\e' -> e' {elContent = map elementToStr $ elContent e'}) + +elementToStr :: Content -> Content +elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing +elementToStr x = x + +textToInt :: Text -> Int -> Int +textToInt t deflt = + let safe_to_int :: Text -> Maybe Int + safe_to_int s = readMaybe $ T.unpack s + in case (safe_to_int t) of + Nothing -> deflt + Just (n) -> n + +parseInline :: (PandocMonad m) => Content -> XMLReader m Inlines +parseInline (Text (CData _ s _)) = + return $ text s +parseInline (CRef ref) = + return $ + maybe (text $ T.toUpper ref) text $ + lookupEntity ref +parseInline (Elem e) = + let name = elementName e + in case (name) of + "Space" -> + let count = textToInt (attrValue atNameSpaceCount e) 1 + in return $ fromList $ replicate count Space + "Str" -> return $ fromList [Str $ attrValue atNameStrContent e] + "Emph" -> innerInlines emph + "Strong" -> innerInlines strong + "Strikeout" -> innerInlines strikeout + "Subscript" -> innerInlines subscript + "Superscript" -> innerInlines superscript + "Underline" -> innerInlines underline + "SoftBreak" -> return softbreak + "LineBreak" -> return linebreak + "SmallCaps" -> innerInlines smallcaps + "Quoted" -> case (attrValue atNameQuoteType e) of + "SingleQuote" -> innerInlines singleQuoted + _ -> innerInlines doubleQuoted + "Math" -> case (attrValue atNameMathType e) of + "DisplayMath" -> pure $ displayMath $ strContentRecursive e + _ -> pure $ math $ strContentRecursive e + "Span" -> innerInlines $ spanWith (attrFromElement e) + "Code" -> do + let attr = attrFromElement e + return $ codeWith attr $ strContentRecursive e + "Link" -> innerInlines $ linkWith attr url title + where + url = attrValue atNameLinkUrl e + title = attrValue atNameTitle e + attr = filterAttrAttributes [atNameLinkUrl, atNameTitle] $ attrFromElement e + "Image" -> innerInlines $ imageWith attr url title + where + url = attrValue atNameImageUrl e + title = attrValue atNameTitle e + attr = filterAttrAttributes [atNameImageUrl, atNameTitle] $ attrFromElement e + "RawInline" -> do + let format = (attrValue atNameFormat e) + return $ rawInline format $ strContentRecursive e + "Note" -> do + contents <- getBlocks e + return $ note contents + "Cite" -> + let (maybe_citations_el, contents) = partitionFirstChildNamed tgNameCitations $ elContent e + in case (maybe_citations_el) of + Just citations_el -> do + citations <- parseCitations $ elContent citations_el + (innerInlines' contents) $ cite citations + Nothing -> getInlines contents + _ -> do + throwError $ PandocXMLError "" ("unexpected element \"" <> name <> "\" in inline context") + where + innerInlines' contents f = + f . concatMany + <$> mapM parseInline contents + innerInlines f = innerInlines' (elContent e) f + +getInlines :: (PandocMonad m) => [Content] -> XMLReader m Inlines +getInlines contents = concatMany <$> mapM parseInline contents + +getListAttributes :: Element -> ListAttributes +getListAttributes e = (start, style, delim) + where + start = textToInt (attrValue atNameStart e) 1 + style = case (attrValue atNameNumberStyle e) of + "Example" -> Example + "Decimal" -> Decimal + "LowerRoman" -> LowerRoman + "UpperRoman" -> UpperRoman + "LowerAlpha" -> LowerAlpha + "UpperAlpha" -> UpperAlpha + _ -> DefaultStyle + delim = case (attrValue atNameNumberDelim e) of + "Period" -> Period + "OneParen" -> OneParen + "TwoParens" -> TwoParens + _ -> DefaultDelim + +contentsOfChildren :: Text -> [Content] -> [[Content]] +contentsOfChildren tag contents = mapMaybe childrenElementWithTag contents + where + childrenElementWithTag :: Content -> Maybe [Content] + childrenElementWithTag c = case (c) of + (Elem e) -> if tag == elementName e then Just (elContent e) else Nothing + _ -> Nothing + +alignmentFromText :: Text -> Alignment +alignmentFromText t = case t of + "AlignLeft" -> AlignLeft + "AlignRight" -> AlignRight + "AlignCenter" -> AlignCenter + _ -> AlignDefault + +getColWidth :: Text -> ColWidth +getColWidth txt = case reads (T.unpack txt) of + [(value, "")] -> if value == 0.0 then ColWidthDefault else ColWidth value + _ -> ColWidthDefault + +getColspecs :: (PandocMonad m) => Maybe Element -> XMLReader m (Maybe [ColSpec]) +getColspecs Nothing = pure Nothing +getColspecs (Just cs) = do + return $ Just $ map elementToColSpec (childrenNamed "ColSpec" cs) + where + elementToColSpec e = (alignmentFromText $ attrValue atNameAlignment e, getColWidth $ attrValue atNameColWidth e) + +getTableBody :: (PandocMonad m) => Element -> XMLReader m (Maybe TableBody) +getTableBody body_el = do + let attr = filterAttrAttributes [atNameRowHeadColumns] $ attrFromElement body_el + bh = childrenNamed tgNameBodyHeader body_el + bb = childrenNamed tgNameBodyBody body_el + headcols = textToInt (attrValue atNameRowHeadColumns body_el) 0 + hrows <- mconcat <$> mapM getRows bh + brows <- mconcat <$> mapM getRows bb + return $ Just $ TableBody attr (RowHeadColumns headcols) hrows brows + +getTableBodies :: (PandocMonad m) => [Element] -> XMLReader m [TableBody] +getTableBodies body_elements = do + catMaybes <$> mapM getTableBody body_elements + +getTableHead :: (PandocMonad m) => Maybe Element -> XMLReader m TableHead +getTableHead maybe_e = case maybe_e of + Just e -> do + let attr = attrFromElement e + rows <- getRows e + return $ TableHead attr rows + Nothing -> return $ TableHead nullAttr [] + +getTableFoot :: (PandocMonad m) => Maybe Element -> XMLReader m TableFoot +getTableFoot maybe_e = case maybe_e of + Just e -> do + let attr = attrFromElement e + rows <- getRows e + return $ TableFoot attr rows + Nothing -> return $ TableFoot nullAttr [] + +getCell :: (PandocMonad m) => Element -> XMLReader m Cell +getCell c = do + let alignment = alignmentFromText $ attrValue atNameAlignment c + rowspan = RowSpan $ textToInt (attrValue atNameRowspan c) 1 + colspan = ColSpan $ textToInt (attrValue atNameColspan c) 1 + attr = filterAttrAttributes [atNameAlignment, atNameRowspan, atNameColspan] $ attrFromElement c + blocks <- getBlocks c + return $ Cell attr alignment rowspan colspan (toList blocks) + +getRows :: (PandocMonad m) => Element -> XMLReader m [Row] +getRows e = mapM getRow $ childrenNamed "Row" e + where + getRow r = do + cells <- mapM getCell (childrenNamed "Cell" r) + return $ Row (attrFromElement r) cells + +parseCitations :: (PandocMonad m) => [Content] -> XMLReader m [Citation] +parseCitations contents = do + maybecitations <- mapM getCitation contents + return $ catMaybes maybecitations + where + getCitation :: (PandocMonad m) => Content -> XMLReader m (Maybe Citation) + getCitation content = case (content) of + (Elem e) -> + if qName (elName e) == "Citation" + then do + p <- inlinesOfChildrenNamed tgNameCitationPrefix e + s <- inlinesOfChildrenNamed tgNameCitationSuffix e + return $ + Just + ( Citation + { citationId = attrValue "id" e, + citationPrefix = toList p, + citationSuffix = toList s, + citationMode = case (attrValue atNameCitationMode e) of + "AuthorInText" -> AuthorInText + "SuppressAuthor" -> SuppressAuthor + _ -> NormalCitation, + citationNoteNum = textToInt (attrValue atNameCitationNoteNum e) 0, + citationHash = textToInt (attrValue atNameCitationHash e) 0 + } + ) + else do + return Nothing + _ -> do + return Nothing + where + inlinesOfChildrenNamed tag e = getInlines $ concatMap (\e' -> elContent e') (childrenNamed tag e) + +parseMaybeCaptionElement :: (PandocMonad m) => Maybe Element -> XMLReader m Caption +parseMaybeCaptionElement Nothing = pure emptyCaption +parseMaybeCaptionElement (Just e) = parseCaption $ elContent e + +parseCaption :: (PandocMonad m) => [Content] -> XMLReader m Caption +parseCaption contents = + let (maybe_shortcaption_el, caption_contents) = partitionFirstChildNamed tgNameShortCaption contents + in do + blocks <- parseBlocks caption_contents + case (maybe_shortcaption_el) of + Just shortcaption_el -> do + short_caption <- getInlines (elContent shortcaption_el) + return $ caption (Just $ toList short_caption) blocks + Nothing -> return $ caption Nothing blocks + +parseDefinitionListItem :: (PandocMonad m) => [Content] -> XMLReader m (Inlines, [Blocks]) +parseDefinitionListItem contents = do + let term_contents = getContentsOfElements (isElementNamed tgNameDefListTerm) contents + defs_elements = elementContents $ filter (isElementNamed tgNameDefListDef) contents + term_inlines <- getInlines (concat term_contents) + defs <- mapM getBlocks defs_elements + return (term_inlines, defs) + +elementContents :: [Content] -> [Element] +elementContents contents = mapMaybe toElement contents + where + toElement :: Content -> Maybe Element + toElement (Elem e) = Just e + toElement _ = Nothing + +isElementNamed :: Text -> Content -> Bool +isElementNamed t c = case (c) of + Elem e -> t == elementName e + _ -> False + +childrenNamed :: Text -> Element -> [Element] +childrenNamed tag e = elementContents $ filter (isElementNamed tag) (elContent e) + +elementsWithNames :: S.Set Text -> [Content] -> [Element] +elementsWithNames tags contents = mapMaybe isElementWithNameInSet contents + where + isElementWithNameInSet c = case (c) of + Elem el -> + if (elementName el) `S.member` tags + then Just el + else Nothing + _ -> Nothing + +partitionFirstChildNamed :: Text -> [Content] -> (Maybe Element, [Content]) +partitionFirstChildNamed tag contents = case (contents) of + (Text (CData _ s _) : rest) -> + if T.all isSpace s + then partitionFirstChildNamed tag rest + else (Nothing, contents) + (Elem e : rest) -> + if tag == elementName e + then (Just e, rest) + else (Nothing, contents) + _ -> (Nothing, contents) + +type PandocAttr = (Text, [Text], [(Text, Text)]) + +filterAttributes :: S.Set Text -> [(Text, Text)] -> [(Text, Text)] +filterAttributes to_be_removed a = filter keep_attr a + where + keep_attr (k, _) = not (k `S.member` to_be_removed) + +filterAttrAttributes :: [Text] -> PandocAttr -> PandocAttr +filterAttrAttributes to_be_removed (idn, classes, a) = (idn, classes, filtered) + where + filtered = filterAttributes (S.fromList to_be_removed) a + +attrFromElement :: Element -> PandocAttr +attrFromElement e = filterAttrAttributes ["id", "class"] (idn, classes, attributes) + where + idn = attrValue "id" e + classes = T.words $ attrValue "class" e + attributes = map (\a -> (qName $ attrKey a, attrVal a)) $ elAttribs e + +addMeta :: (PandocMonad m) => (ToMetaValue a) => Text -> a -> XMLReader m () +addMeta field val = modify (setMeta field val) + +instance HasMeta XMLReaderState where + setMeta field v s = s {xmlMeta = setMeta field v (xmlMeta s)} + + deleteMeta field s = s {xmlMeta = deleteMeta field (xmlMeta s)} + +parseMetaMapEntry :: (PandocMonad m) => Element -> XMLReader m (Maybe (Text, MetaValue)) +parseMetaMapEntry e = + let key = attrValue atNameMetaMapEntryKey e + in case (key) of + "" -> pure Nothing + k -> do + maybe_value <- parseMetaMapEntryContents $ elContent e + case (maybe_value) of + Nothing -> return Nothing + Just v -> return $ Just (k, v) + +parseMetaMapEntryContents :: (PandocMonad m) => [Content] -> XMLReader m (Maybe MetaValue) +parseMetaMapEntryContents cs = msum <$> mapM parseMeta cs + +parseMeta :: (PandocMonad m) => Content -> XMLReader m (Maybe MetaValue) +parseMeta (Text (CData CDataRaw _ _)) = return Nothing +parseMeta (Text (CData _ s _)) = + if T.all isSpace s + then return Nothing + else do + throwError $ PandocXMLError "" "non-space characters out of inline context in metadata" +parseMeta (CRef x) = + throwError $ PandocXMLError "" ("reference \"" <> x <> "\" out of inline context") +parseMeta (Elem e) = do + let name = elementName e + in case (name) of + "MetaBool" -> case (attrValue atNameMetaBoolValue e) of + "true" -> return $ Just $ MetaBool True + _ -> return $ Just $ MetaBool False + "MetaString" -> pure Nothing + "MetaInlines" -> do + inlines <- getInlines (elContent e) + return $ Just $ MetaInlines $ toList inlines + "MetaBlocks" -> do + blocks <- getBlocks e + return $ Just $ MetaBlocks $ toList blocks + "MetaList" -> do + maybe_items <- mapM parseMeta $ elContent e + let items = catMaybes maybe_items + in -- TODO: report empty MetaList? + return $ Just $ MetaList items + "MetaMap" -> + let entry_els = childrenNamed tgNameMetaMapEntry e + in do + entries <- catMaybes <$> mapM parseMetaMapEntry entry_els + if null entries + then + -- TODO: report empty MetaMap + return Nothing + else return $ Just $ MetaMap $ M.fromList entries + _ -> do + throwError $ PandocXMLError "" ("unexpected element \"" <> name <> "\" in metadata") diff --git a/src/Text/Pandoc/Writers.hs b/src/Text/Pandoc/Writers.hs index 3e1974ad6..164bf6d0f 100644 --- a/src/Text/Pandoc/Writers.hs +++ b/src/Text/Pandoc/Writers.hs @@ -76,6 +76,7 @@ module Text.Pandoc.Writers , writeTexinfo , writeTextile , writeTypst + , writeXML , writeXWiki , writeZimWiki , getWriter @@ -128,6 +129,7 @@ import Text.Pandoc.Writers.TEI import Text.Pandoc.Writers.Texinfo import Text.Pandoc.Writers.Textile import Text.Pandoc.Writers.Typst +import Text.Pandoc.Writers.XML import Text.Pandoc.Writers.XWiki import Text.Pandoc.Writers.ZimWiki @@ -203,6 +205,7 @@ writers = [ ,("chunkedhtml" , ByteStringWriter writeChunkedHTML) ,("djot" , TextWriter writeDjot) ,("ansi" , TextWriter writeANSI) + ,("xml" , TextWriter writeXML) ] -- | Retrieve writer, extensions based on formatSpec (format+extensions). diff --git a/src/Text/Pandoc/Writers/XML.hs b/src/Text/Pandoc/Writers/XML.hs new file mode 100644 index 000000000..35bcb07f8 --- /dev/null +++ b/src/Text/Pandoc/Writers/XML.hs @@ -0,0 +1,365 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- | +-- Module : Text.Pandoc.Writers.XML +-- Copyright : Copyright (C) 2025- Massimiliano Farinella and John MacFarlane +-- License : GNU GPL, version 2 or above +-- +-- Maintainer : Massimiliano Farinella <[email protected]> +-- Stability : WIP +-- Portability : portable +-- +-- Conversion of 'Pandoc' documents to (pandoc specific) xml markup. +module Text.Pandoc.Writers.XML (writeXML) where + +import Data.Map (Map, toList) +import Data.Maybe (mapMaybe) +import qualified Data.Text as T +import Data.Version (versionBranch) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Options (WriterOptions (..)) +import Text.Pandoc.XML.Light +import qualified Text.Pandoc.XML.Light as XML +import Text.Pandoc.XMLFormat +import Text.XML.Light (xml_header) + +type PandocAttr = Text.Pandoc.Definition.Attr + +writeXML :: (PandocMonad m) => WriterOptions -> Pandoc -> m T.Text +writeXML _ doc = do + return $ pandocToXmlText doc + +text_node :: T.Text -> Content +text_node text = Text (CData CDataText text Nothing) + +emptyElement :: T.Text -> Element +emptyElement tag = + Element + { elName = unqual tag, + elAttribs = [], + elContent = [], + elLine = Nothing + } + +elementWithContents :: T.Text -> [Content] -> Element +elementWithContents tag contents = + Element + { elName = unqual tag, + elAttribs = [], + elContent = contents, + elLine = Nothing + } + +elementWithAttributes :: T.Text -> [XML.Attr] -> Element +elementWithAttributes tag attributes = + Element + { elName = unqual tag, + elAttribs = attributes, + elContent = [], + elLine = Nothing + } + +elementWithAttrAndContents :: T.Text -> PandocAttr -> [Content] -> Element +elementWithAttrAndContents tag attr contents = addAttrAttributes attr $ elementWithContents tag contents + +asBlockOfInlines :: Element -> [Content] +asBlockOfInlines el = [Elem el, text_node "\n"] + +asBlockOfBlocks :: Element -> [Content] +asBlockOfBlocks el = [Elem newline_before_first, newline] + where + newline = text_node "\n" + newline_before_first = if null (elContent el) then el else prependContents [newline] el + +itemName :: (Show a) => a -> T.Text +itemName a = T.pack $ takeWhile (/= ' ') (show a) + +intAsText :: Int -> T.Text +intAsText i = T.pack $ show i + +itemAsEmptyElement :: (Show a) => a -> Element +itemAsEmptyElement item = emptyElement $ itemName item + +pandocToXmlText :: Pandoc -> T.Text +pandocToXmlText (Pandoc (Meta meta) blocks) = with_header . with_blocks . with_meta . with_version $ el + where + el = prependContents [text_node "\n"] $ emptyElement "Pandoc" + with_version = addAttribute atNameApiVersion (T.intercalate "," $ map (T.pack . show) $ versionBranch pandocTypesVersion) + with_meta = appendContents (metaMapToXML meta "meta") + with_blocks = appendContents (asBlockOfBlocks $ elementWithContents "blocks" $ blocksToXML blocks) + with_header :: Element -> T.Text + with_header e = T.concat [T.pack xml_header, "\n", showElement e] + +metaMapToXML :: Map T.Text MetaValue -> T.Text -> [Content] +metaMapToXML mmap tag = asBlockOfBlocks $ elementWithContents tag entries + where + entries = concatMap to_entry $ toList mmap + to_entry :: (T.Text, MetaValue) -> [Content] + to_entry (text, metavalue) = asBlockOfBlocks with_key + where + entry = elementWithContents tgNameMetaMapEntry $ metaValueToXML metavalue + with_key = addAttribute atNameMetaMapEntryKey text entry + +metaValueToXML :: MetaValue -> [Content] +metaValueToXML value = + let name = itemName value + el = itemAsEmptyElement value + in case (value) of + MetaBool b -> asBlockOfInlines $ addAttribute atNameMetaBoolValue bool_value el + where + bool_value = if b then "true" else "false" + MetaString s -> asBlockOfInlines $ appendContents [text_node s] el + MetaInlines inlines -> asBlockOfInlines $ appendContents (inlinesToXML inlines) el + MetaBlocks blocks -> asBlockOfBlocks $ appendContents (blocksToXML blocks) el + MetaList items -> asBlockOfBlocks $ appendContents (concatMap metaValueToXML items) el + MetaMap mm -> metaMapToXML mm name + +blocksToXML :: [Block] -> [Content] +blocksToXML blocks = concatMap blockToXML blocks + +inlinesToXML :: [Inline] -> [Content] +inlinesToXML inlines = concatMap inlineContentToContents (ilsToIlsContent inlines []) + +data InlineContent + = NormalInline Inline + | ElSpace Int + | ElStr T.Text + +ilsToIlsContent :: [Inline] -> [InlineContent] -> [InlineContent] +ilsToIlsContent (Space : xs) [] = ilsToIlsContent xs [ElSpace 1] +ilsToIlsContent (Space : xs) (NormalInline Space : cs) = ilsToIlsContent xs (ElSpace 2 : cs) +ilsToIlsContent (Space : xs) (ElSpace n : cs) = ilsToIlsContent xs (ElSpace (n + 1) : cs) +-- empty Str are always encoded as <Str /> +ilsToIlsContent (Str "" : xs) ilct = ilsToIlsContent xs (ElStr "" : ilct) +-- Str s1, Str s2 -> s1<Str content="s2"> +ilsToIlsContent (Str s2 : xs) (NormalInline str1@(Str _) : ilct) = ilsToIlsContent xs (ElStr s2 : NormalInline str1 : ilct) +-- +ilsToIlsContent (Str s : xs) ilct = + if T.any (== ' ') s + then ilsToIlsContent xs (ElStr s : ilct) + else ilsToIlsContent xs (NormalInline (Str s) : ilct) +ilsToIlsContent (x : xs) ilct = ilsToIlsContent xs (NormalInline x : ilct) +ilsToIlsContent [] ilct = reverse $ lastSpaceAsElem ilct + where + lastSpaceAsElem :: [InlineContent] -> [InlineContent] + lastSpaceAsElem (NormalInline Space : xs) = ElSpace 1 : xs + lastSpaceAsElem ilcts = ilcts + +inlineContentToContents :: InlineContent -> [Content] +inlineContentToContents (NormalInline il) = inlineToXML il +inlineContentToContents (ElSpace 1) = [Elem $ emptyElement "Space"] +inlineContentToContents (ElSpace n) = [Elem $ addAttribute atNameSpaceCount (intAsText n) (emptyElement "Space")] +inlineContentToContents (ElStr "") = [Elem $ emptyElement "Str"] +inlineContentToContents (ElStr s) = [Elem $ addAttribute atNameStrContent s (emptyElement "Str")] + +asContents :: Element -> [Content] +asContents el = [Elem el] + +wrapBlocks :: T.Text -> [Block] -> [Content] +wrapBlocks tag blocks = asBlockOfBlocks $ elementWithContents tag $ blocksToXML blocks + +wrapArrayOfBlocks :: T.Text -> [[Block]] -> [Content] +wrapArrayOfBlocks tag array = concatMap (wrapBlocks tag) array + +-- wrapInlines :: T.Text -> [Inline] -> [Content] +-- wrapInlines tag inlines = asBlockOfInlines $ element_with_contents tag $ inlinesToXML inlines + +blockToXML :: Block -> [Content] +blockToXML block = + let el = itemAsEmptyElement block + in case (block) of + Para inlines -> asBlockOfInlines $ appendContents (inlinesToXML inlines) el + Header level (idn, cls, attrs) inlines -> asBlockOfInlines $ appendContents (inlinesToXML inlines) with_attr + where + with_attr = addAttrAttributes (idn, cls, attrs ++ [(atNameLevel, intAsText level)]) el + Plain inlines -> asBlockOfInlines $ appendContents (inlinesToXML inlines) el + Div attr blocks -> asBlockOfBlocks $ appendContents (blocksToXML blocks) with_attr + where + with_attr = addAttrAttributes attr el + BulletList items -> asBlockOfBlocks $ appendContents (wrapArrayOfBlocks tgNameListItem items) el + OrderedList (start, style, delim) items -> asBlockOfBlocks $ with_contents . with_attrs $ el + where + with_attrs = + addAttributes + ( validAttributes + [ (atNameStart, intAsText start), + (atNameNumberStyle, itemName style), + (atNameNumberDelim, itemName delim) + ] + ) + with_contents = appendContents (wrapArrayOfBlocks tgNameListItem items) + BlockQuote blocks -> asBlockOfBlocks $ appendContents (blocksToXML blocks) el + HorizontalRule -> asBlockOfInlines el + CodeBlock attr text -> asBlockOfInlines $ with_contents . with_attr $ el + where + with_contents = appendContents [text_node text] + with_attr = addAttrAttributes attr + LineBlock lins -> asBlockOfBlocks $ appendContents (concatMap wrapInlines lins) el + where + wrapInlines inlines = asContents $ appendContents (inlinesToXML inlines) $ emptyElement tgNameLineItem + Table attr caption colspecs thead tbodies tfoot -> asBlockOfBlocks $ with_foot . with_bodies . with_head . with_colspecs . with_caption . with_attr $ el + where + with_attr = addAttrAttributes attr + with_caption = appendContents (captionToXML caption) + with_colspecs = appendContents (colSpecsToXML colspecs) + with_head = appendContents (tableHeadToXML thead) + with_bodies = appendContents (concatMap tableBodyToXML tbodies) + with_foot = appendContents (tableFootToXML tfoot) + Figure attr caption blocks -> asBlockOfBlocks $ with_contents . with_caption . with_attr $ el + where + with_attr = addAttrAttributes attr + with_caption = appendContents (captionToXML caption) + with_contents = appendContents (blocksToXML blocks) + RawBlock (Format format) text -> asContents $ appendContents [text_node text] raw + where + raw = addAttribute atNameFormat format el + DefinitionList items -> asBlockOfBlocks $ appendContents (map definitionListItemToXML items) el + +inlineToXML :: Inline -> [Content] +inlineToXML inline = + let el = itemAsEmptyElement inline + wrapInlines inlines = asContents $ appendContents (inlinesToXML inlines) el + in case (inline) of + Space -> [text_node " "] + Str s -> [text_node s] + Emph inlines -> wrapInlines inlines + Strong inlines -> wrapInlines inlines + Quoted quote_type inlines -> asContents $ appendContents (inlinesToXML inlines) quoted + where + quoted = addAttribute atNameQuoteType (itemName quote_type) el + Underline inlines -> wrapInlines inlines + Strikeout inlines -> wrapInlines inlines + SmallCaps inlines -> wrapInlines inlines + Superscript inlines -> wrapInlines inlines + Subscript inlines -> wrapInlines inlines + SoftBreak -> asContents el + LineBreak -> asContents el + Span attr inlines -> asContents $ appendContents (inlinesToXML inlines) with_attr + where + with_attr = addAttrAttributes attr el + Link (idn, cls, attrs) inlines (url, title) -> asContents $ appendContents (inlinesToXML inlines) with_attr + where + with_attr = addAttrAttributes (idn, cls, attrs ++ [(atNameLinkUrl, url), (atNameTitle, title)]) el + Image (idn, cls, attrs) inlines (url, title) -> asContents $ appendContents (inlinesToXML inlines) with_attr + where + with_attr = addAttrAttributes (idn, cls, attrs ++ [(atNameImageUrl, url), (atNameTitle, title)]) el + RawInline (Format format) text -> asContents $ appendContents [text_node text] raw + where + raw = addAttribute atNameFormat format el + Math math_type text -> asContents $ appendContents [text_node text] math + where + math = addAttribute atNameMathType (itemName math_type) el + Code attr text -> asContents $ appendContents [text_node text] with_attr + where + with_attr = addAttrAttributes attr el + Note blocks -> asContents $ appendContents (blocksToXML blocks) el + Cite citations inlines -> asContents $ appendContents (inlinesToXML inlines) with_citations + where + with_citations = addCitations citations el + +-- TODO: don't let an attribute overwrite id or class +maybeAttribute :: (T.Text, T.Text) -> Maybe XML.Attr +maybeAttribute (_, "") = Nothing +maybeAttribute ("", _) = Nothing +maybeAttribute (name, value) = Just $ XML.Attr (unqual name) value + +validAttributes :: [(T.Text, T.Text)] -> [XML.Attr] +validAttributes pairs = mapMaybe maybeAttribute pairs + +appendContents :: [Content] -> Element -> Element +appendContents newContents el = el {elContent = (elContent el) ++ newContents} + +prependContents :: [Content] -> Element -> Element +prependContents newContents el = el {elContent = newContents ++ (elContent el)} + +addAttributes :: [XML.Attr] -> Element -> Element +addAttributes newAttrs el = el {elAttribs = newAttrs ++ elAttribs el} + +addAttribute :: T.Text -> T.Text -> Element -> Element +addAttribute attr_name attr_value el = el {elAttribs = new_attr : elAttribs el} + where + new_attr = XML.Attr (unqual attr_name) attr_value + +addAttrAttributes :: PandocAttr -> Element -> Element +addAttrAttributes (identifier, classes, attributes) el = addAttributes attrs' el + where + attrs' = mapMaybe maybeAttribute (("id", identifier) : ("class", T.intercalate " " classes) : attributes) + +addCitations :: [Citation] -> Element -> Element +addCitations citations el = appendContents [Elem $ elementWithContents tgNameCitations $ (text_node "\n") : concatMap citation_to_elem citations] el + where + citation_to_elem :: Citation -> [Content] + citation_to_elem citation = asBlockOfInlines with_suffix + where + cit_elem = elementWithAttributes (itemName citation) attrs + prefix = citationPrefix citation + suffix = citationSuffix citation + with_prefix = + if null prefix + then cit_elem + else appendContents [Elem $ elementWithContents tgNameCitationPrefix $ inlinesToXML prefix] cit_elem + with_suffix = + if null suffix + then with_prefix + else appendContents [Elem $ elementWithContents tgNameCitationSuffix $ inlinesToXML suffix] with_prefix + attrs = + map + (\(n, v) -> XML.Attr (unqual n) v) + [ ("id", citationId citation), + (atNameCitationMode, T.pack $ show $ citationMode citation), + (atNameCitationNoteNum, intAsText $ citationNoteNum citation), + (atNameCitationHash, intAsText $ citationHash citation) + ] + +definitionListItemToXML :: ([Inline], [[Block]]) -> Content +definitionListItemToXML (inlines, defs) = Elem $ elementWithContents tgNameDefListItem $ term ++ wrapArrayOfBlocks tgNameDefListDef defs + where + term = asBlockOfInlines $ appendContents (inlinesToXML inlines) $ emptyElement tgNameDefListTerm + +captionToXML :: Caption -> [Content] +captionToXML (Caption short blocks) = asBlockOfBlocks with_short_caption + where + el = elementWithContents "Caption" $ blocksToXML blocks + with_short_caption = case (short) of + Just inlines -> prependContents (asBlockOfInlines $ elementWithContents tgNameShortCaption $ inlinesToXML inlines) el + _ -> el + +colSpecToXML :: (Alignment, ColWidth) -> [Content] +colSpecToXML (align, cw) = asBlockOfInlines colspec + where + colspec = elementWithAttributes "ColSpec" $ validAttributes [(atNameAlignment, itemName align), (atNameColWidth, colwidth)] + colwidth = case (cw) of + ColWidth d -> T.pack $ show d + ColWidthDefault -> "0" + +colSpecsToXML :: [(Alignment, ColWidth)] -> [Content] +colSpecsToXML colspecs = asBlockOfBlocks $ elementWithContents tgNameColspecs $ concatMap colSpecToXML colspecs + +tableHeadToXML :: TableHead -> [Content] +tableHeadToXML (TableHead attr rows) = asBlockOfBlocks $ elementWithAttrAndContents "TableHead" attr $ concatMap rowToXML rows + +tableBodyToXML :: TableBody -> [Content] +tableBodyToXML (TableBody (idn, cls, attrs) (RowHeadColumns headcols) hrows brows) = asBlockOfBlocks $ elementWithAttrAndContents "TableBody" attr children + where + attr = (idn, cls, (atNameRowHeadColumns, intAsText headcols) : attrs) + header_rows = asBlockOfBlocks $ elementWithContents tgNameBodyHeader $ concatMap rowToXML hrows + body_rows = asBlockOfBlocks $ elementWithContents tgNameBodyBody $ concatMap rowToXML brows + children = header_rows ++ body_rows + +tableFootToXML :: TableFoot -> [Content] +tableFootToXML (TableFoot attr rows) = asBlockOfBlocks $ elementWithAttrAndContents "TableFoot" attr $ concatMap rowToXML rows + +rowToXML :: Row -> [Content] +rowToXML (Row attr cells) = asBlockOfBlocks $ elementWithAttrAndContents "Row" attr $ concatMap cellToXML cells + +cellToXML :: Cell -> [Content] +cellToXML (Cell (idn, cls, attrs) alignment (RowSpan rowspan) (ColSpan colspan) blocks) = asBlockOfBlocks $ elementWithAttrAndContents "Cell" attr $ blocksToXML blocks + where + with_alignment a = (atNameAlignment, itemName alignment) : a + with_rowspan a = if rowspan > 1 then (atNameRowspan, intAsText rowspan) : a else a + with_colspan a = if colspan > 1 then (atNameColspan, intAsText colspan) : a else a + attrs' = (with_colspan . with_rowspan . with_alignment) attrs + attr = (idn, cls, attrs') diff --git a/src/Text/Pandoc/XMLFormat.hs b/src/Text/Pandoc/XMLFormat.hs new file mode 100644 index 000000000..6801de90c --- /dev/null +++ b/src/Text/Pandoc/XMLFormat.hs @@ -0,0 +1,188 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} + +module Text.Pandoc.XMLFormat + ( atNameAlignment, + atNameApiVersion, + atNameCitationHash, + atNameCitationMode, + atNameCitationNoteNum, + atNameColspan, + atNameColWidth, + atNameFormat, + atNameImageUrl, + atNameLevel, + atNameLinkUrl, + atNameMathType, + atNameMetaBoolValue, + atNameMetaMapEntryKey, + atNameNumberDelim, + atNameNumberStyle, + atNameQuoteType, + atNameRowHeadColumns, + atNameRowspan, + atNameSpaceCount, + atNameStart, + atNameStrContent, + atNameTitle, + tgNameBodyBody, + tgNameBodyHeader, + tgNameCitations, + tgNameCitationPrefix, + tgNameCitationSuffix, + tgNameColspecs, + tgNameDefListDef, + tgNameDefListItem, + tgNameDefListTerm, + tgNameLineItem, + tgNameListItem, + tgNameMetaMapEntry, + tgNameShortCaption, + ) +where + +import Data.Text (Text) + +-- the attribute carrying the API version of pandoc types in the main Pandoc element +atNameApiVersion :: Text +atNameApiVersion = "api-version" + +-- the element of a <meta> or <MetaMap> entry +tgNameMetaMapEntry :: Text +tgNameMetaMapEntry = "entry" + +-- the attribute carrying the key name of a <meta> or <MetaMap> entry +atNameMetaMapEntryKey :: Text +atNameMetaMapEntryKey = "key" + +-- the attribute carrying the boolean value ("true" or "false") of a MetaBool +atNameMetaBoolValue :: Text +atNameMetaBoolValue = "value" + +-- level of a Header +atNameLevel :: Text +atNameLevel = "level" + +-- start number of an OrderedList +atNameStart :: Text +atNameStart = "start" + +-- number delimiter of an OrderedList +atNameNumberDelim :: Text +atNameNumberDelim = "number-delim" + +-- number style of an OrderedList +atNameNumberStyle :: Text +atNameNumberStyle = "number-style" + +-- target title in Image and Link +atNameTitle :: Text +atNameTitle = "title" + +-- target url in Image +atNameImageUrl :: Text +atNameImageUrl = "src" + +-- target url in Link +atNameLinkUrl :: Text +atNameLinkUrl = "href" + +-- QuoteType of a Quoted +atNameQuoteType :: Text +atNameQuoteType = "quote-type" + +-- MathType of a Math +atNameMathType :: Text +atNameMathType = "math-type" + +-- format of a RawInline or a RawBlock +atNameFormat :: Text +atNameFormat = "format" + +-- alignment attribute in a ColSpec or in a Cell +atNameAlignment :: Text +atNameAlignment = "alignment" + +-- ColWidth attribute in a ColSpec +atNameColWidth :: Text +atNameColWidth = "col-width" + +-- RowHeadColumns attribute in a TableBody +atNameRowHeadColumns :: Text +atNameRowHeadColumns = "row-head-columns" + +-- RowSpan attribute in a Cell +atNameRowspan :: Text +atNameRowspan = "row-span" + +-- ColSpan attribute in a Cell +atNameColspan :: Text +atNameColspan = "col-span" + +-- the citationMode of a Citation +atNameCitationMode :: Text +atNameCitationMode = "mode" + +-- the citationHash of a Citation +atNameCitationHash :: Text +atNameCitationHash = "hash" + +-- the citationNoteNum of a Citation +atNameCitationNoteNum :: Text +atNameCitationNoteNum = "note-num" + +-- the number of consecutive spaces of the <Space> element +atNameSpaceCount :: Text +atNameSpaceCount = "count" + +-- the content of the <Str> element +atNameStrContent :: Text +atNameStrContent = "content" + +-- container of Citation elements in Cite inlines +tgNameCitations :: Text +tgNameCitations = "citations" + +-- element around the prefix inlines of a Citation +tgNameCitationPrefix :: Text +tgNameCitationPrefix = "prefix" + +-- element around the suffix inlines of a Citation +tgNameCitationSuffix :: Text +tgNameCitationSuffix = "suffix" + +-- list item for BulletList and OrderedList +tgNameListItem :: Text +tgNameListItem = "item" + +-- list item for DefinitionList +tgNameDefListItem :: Text +tgNameDefListItem = "item" + +-- element around the inlines of the term of a DefinitionList item +tgNameDefListTerm :: Text +tgNameDefListTerm = "term" + +-- element around the blocks of a definition in a DefinitionList item +tgNameDefListDef :: Text +tgNameDefListDef = "def" + +-- optional element of the ShortCaption +tgNameShortCaption :: Text +tgNameShortCaption = "ShortCaption" + +-- element around the ColSpec of a Table +tgNameColspecs :: Text +tgNameColspecs = "colspecs" + +-- element around the header rows of a TableBody +tgNameBodyHeader :: Text +tgNameBodyHeader = "header" + +-- element around the body rows of a TableBody +tgNameBodyBody :: Text +tgNameBodyBody = "body" + +-- element around the inlines of a line in a LineBlock +tgNameLineItem :: Text +tgNameLineItem = "line" |
