aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Antich <[email protected]>2025-11-09 13:48:14 +0100
committerJohn MacFarlane <[email protected]>2025-11-24 22:37:39 +0100
commitec75b693e5618c12ddac872d48e084436f1e1b48 (patch)
tree942edb1022128c5f769b928ee5ba3b0a6ee6b4f7
parent5d3b2916d616902d205146c108629053516fd9f4 (diff)
Support pptx (PowerPoint) as an input format.
New module `Text.Pandoc.Readers.Pptx`, exporting `readPptx`. [API change] Factored out some common OOXML functions from Text.Pandoc.Readers.Docx.Util into a non-exported module Text.Pandoc.Readers.OOXML.Shared.
-rw-r--r--MANUAL.txt1
-rw-r--r--pandoc.cabal9
-rw-r--r--src/Text/Pandoc/Readers.hs3
-rw-r--r--src/Text/Pandoc/Readers/Docx/Util.hs48
-rw-r--r--src/Text/Pandoc/Readers/OOXML/Shared.hs95
-rw-r--r--src/Text/Pandoc/Readers/Pptx.hs43
-rw-r--r--src/Text/Pandoc/Readers/Pptx/Parse.hs216
-rw-r--r--src/Text/Pandoc/Readers/Pptx/Shapes.hs330
-rw-r--r--src/Text/Pandoc/Readers/Pptx/Slides.hs83
-rw-r--r--src/Text/Pandoc/Readers/Pptx/SmartArt.hs220
-rw-r--r--test/Tests/Readers/Pptx.hs63
-rw-r--r--test/pptx-reader/basic.native149
-rw-r--r--test/pptx-reader/basic.pptxbin0 -> 111674 bytes
-rw-r--r--test/test-pandoc.hs2
14 files changed, 1218 insertions, 44 deletions
diff --git a/MANUAL.txt b/MANUAL.txt
index 8f9ba8b0e..d394b2f6d 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -273,6 +273,7 @@ header when requesting a document from a URL:
- `opml` ([OPML])
- `org` ([Emacs Org mode])
- `pod` (Perl's [Plain Old Documentation])
+ - `pptx` ([PowerPoint])
- `ris` ([RIS] bibliography)
- `rtf` ([Rich Text Format])
- `rst` ([reStructuredText])
diff --git a/pandoc.cabal b/pandoc.cabal
index a5f21dbb2..f648dce11 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -426,6 +426,8 @@ extra-source-files:
test/pptx/*.pptx
test/pptx/**/*.pptx
test/pptx/**/*.native
+ test/pptx-reader/basic.pptx
+ test/pptx-reader/basic.native
test/ipynb/*.native
test/ipynb/*.in.native
test/ipynb/*.out.native
@@ -610,6 +612,7 @@ library
Text.Pandoc.Readers.TikiWiki,
Text.Pandoc.Readers.Txt2Tags,
Text.Pandoc.Readers.Docx,
+ Text.Pandoc.Readers.Pptx,
Text.Pandoc.Readers.ODT,
Text.Pandoc.Readers.EPUB,
Text.Pandoc.Readers.Muse,
@@ -718,6 +721,11 @@ library
Text.Pandoc.Readers.Docx.Util,
Text.Pandoc.Readers.Docx.Symbols,
Text.Pandoc.Readers.Docx.Fields,
+ Text.Pandoc.Readers.OOXML.Shared,
+ Text.Pandoc.Readers.Pptx.Parse,
+ Text.Pandoc.Readers.Pptx.Shapes,
+ Text.Pandoc.Readers.Pptx.Slides,
+ Text.Pandoc.Readers.Pptx.SmartArt,
Text.Pandoc.Readers.HTML.Parsing,
Text.Pandoc.Readers.HTML.Table,
Text.Pandoc.Readers.HTML.TagCategories,
@@ -854,6 +862,7 @@ test-suite test-pandoc
Tests.Readers.RST
Tests.Readers.RTF
Tests.Readers.Docx
+ Tests.Readers.Pptx
Tests.Readers.ODT
Tests.Readers.Txt2Tags
Tests.Readers.EPUB
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 12d1c6c95..5f7b891e2 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -26,6 +26,7 @@ module Text.Pandoc.Readers
Reader (..)
, readers
, readDocx
+ , readPptx
, readODT
, readMarkdown
, readCommonMark
@@ -87,6 +88,7 @@ import Text.Pandoc.Readers.Markdown
import Text.Pandoc.Readers.Creole
import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.Docx
+import Text.Pandoc.Readers.Pptx
import Text.Pandoc.Readers.DokuWiki
import Text.Pandoc.Readers.EPUB
import Text.Pandoc.Readers.FB2
@@ -157,6 +159,7 @@ readers = [("native" , TextReader readNative)
,("twiki" , TextReader readTWiki)
,("tikiwiki" , TextReader readTikiWiki)
,("docx" , ByteStringReader readDocx)
+ ,("pptx" , ByteStringReader readPptx)
,("odt" , ByteStringReader readODT)
,("t2t" , TextReader readTxt2Tags)
,("epub" , ByteStringReader readEPUB)
diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs
index 88c1973f9..b8c51534b 100644
--- a/src/Text/Pandoc/Readers/Docx/Util.hs
+++ b/src/Text/Pandoc/Readers/Docx/Util.hs
@@ -24,51 +24,11 @@ module Text.Pandoc.Readers.Docx.Util (
, extractChildren
) where
-import qualified Data.Text as T
-import Data.Text (Text)
-import Text.Pandoc.XML.Light
-import qualified Data.Map as M
import Data.List (partition)
-
-type NameSpaces = M.Map Text Text
-
-elemToNameSpaces :: Element -> NameSpaces
-elemToNameSpaces = foldr (\(Attr qn val) ->
- case qn of
- QName s _ (Just "xmlns") -> M.insert s val
- _ -> id) mempty . elAttribs
-
-elemName :: NameSpaces -> Text -> Text -> QName
-elemName ns prefix name =
- QName name (M.lookup prefix ns)
- (if T.null prefix then Nothing else Just prefix)
-
-isElem :: NameSpaces -> Text -> Text -> Element -> Bool
-isElem ns prefix name element =
- let ns' = ns <> elemToNameSpaces element
- in qName (elName element) == name &&
- qURI (elName element) == M.lookup prefix ns'
-
-findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
-findChildByName ns pref name el =
- let ns' = ns <> elemToNameSpaces el
- in findChild (elemName ns' pref name) el
-
-findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
-findChildrenByName ns pref name el =
- let ns' = ns <> elemToNameSpaces el
- in findChildren (elemName ns' pref name) el
-
--- | Like 'findChildrenByName', but searches descendants.
-findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
-findElementByName ns pref name el =
- let ns' = ns <> elemToNameSpaces el
- in findElement (elemName ns' pref name) el
-
-findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
-findAttrByName ns pref name el =
- let ns' = ns <> elemToNameSpaces el
- in findAttr (elemName ns' pref name) el
+import Text.Pandoc.XML.Light
+import Text.Pandoc.Readers.OOXML.Shared
+ (NameSpaces, elemName, isElem, elemToNameSpaces,
+ findChildByName, findChildrenByName, findElementByName, findAttrByName)
-- | Removes child elements that satisfy a given condition.
diff --git a/src/Text/Pandoc/Readers/OOXML/Shared.hs b/src/Text/Pandoc/Readers/OOXML/Shared.hs
new file mode 100644
index 000000000..0aa07a736
--- /dev/null
+++ b/src/Text/Pandoc/Readers/OOXML/Shared.hs
@@ -0,0 +1,95 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.OOXML.Shared
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Shared utilities for Office Open XML (OOXML) readers (DOCX, PPTX).
+Provides common functions for ZIP archive handling, XML parsing,
+namespace management, and DrawingML parsing.
+-}
+module Text.Pandoc.Readers.OOXML.Shared
+ ( -- * Constants
+ emusPerInch
+ , emuToInches
+ , inchesToEmu
+ -- * Types
+ , NameSpaces
+ , elemName
+ , elemToNameSpaces
+ , isElem
+ , findChildByName
+ , findChildrenByName
+ , findElementByName
+ , findAttrByName
+ ) where
+
+import qualified Data.Map as M
+import qualified Data.Text as T
+import Data.Text (Text)
+import Text.Pandoc.XML.Light
+
+-- | Type alias for namespace mappings
+type NameSpaces = M.Map Text Text
+
+-- | English Metric Units per inch
+-- 1 inch = 914400 EMUs (used in OOXML for dimensions)
+emusPerInch :: Integer
+emusPerInch = 914400
+
+-- | Convert EMUs to inches
+emuToInches :: Integer -> Double
+emuToInches n = fromIntegral n / fromIntegral emusPerInch
+
+-- | Convert inches to EMUs
+inchesToEmu :: Double -> Integer
+inchesToEmu n = round (n * fromIntegral emusPerInch)
+
+-- | Extract namespace declarations from element attributes
+elemToNameSpaces :: Element -> NameSpaces
+elemToNameSpaces = foldr (\(Attr qn val) ->
+ case qn of
+ QName s _ (Just "xmlns") -> M.insert s val
+ _ -> id) mempty . elAttribs
+
+-- | Create a qualified name from namespace map, prefix, and local name
+elemName :: NameSpaces -> Text -> Text -> QName
+elemName ns prefix name =
+ QName name
+ (M.lookup prefix ns)
+ (if T.null prefix then Nothing else Just prefix)
+
+-- | Check if element matches namespace prefix and local name
+isElem :: NameSpaces -> Text -> Text -> Element -> Bool
+isElem ns prefix name element =
+ let ns' = ns <> elemToNameSpaces element
+ in qName (elName element) == name &&
+ qURI (elName element) == M.lookup prefix ns'
+
+-- | Find first child element matching namespace and name
+findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
+findChildByName ns pref name el =
+ let ns' = ns <> elemToNameSpaces el
+ in findChild (elemName ns' pref name) el
+
+-- | Find all children matching namespace and name
+findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
+findChildrenByName ns pref name el =
+ let ns' = ns <> elemToNameSpaces el
+ in findChildren (elemName ns' pref name) el
+
+-- | Find element anywhere in descendants matching namespace and name
+findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
+findElementByName ns pref name el =
+ let ns' = ns <> elemToNameSpaces el
+ in findElement (elemName ns' pref name) el
+
+-- | Find attribute value by namespace prefix and name
+findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
+findAttrByName ns pref name el =
+ let ns' = ns <> elemToNameSpaces el
+ in findAttr (elemName ns' pref name) el
diff --git a/src/Text/Pandoc/Readers/Pptx.hs b/src/Text/Pandoc/Readers/Pptx.hs
new file mode 100644
index 000000000..6ffc8ee3c
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx.hs
@@ -0,0 +1,43 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Pptx
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Conversion of PPTX (PowerPoint) documents to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Pptx (readPptx) where
+
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Codec.Archive.Zip (toArchiveOrFail)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition (Pandoc(..))
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Pptx.Parse (archiveToPptx)
+import Text.Pandoc.Readers.Pptx.Slides (pptxToOutput)
+
+-- | Read PPTX file into Pandoc AST
+readPptx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
+readPptx opts bytes =
+ case toArchiveOrFail bytes of
+ Right archive ->
+ case archiveToPptx archive of
+ Right pptx -> do
+ -- Convert Pptx intermediate to Pandoc AST
+ (meta, blocks) <- pptxToOutput opts pptx
+ return $ Pandoc meta blocks
+
+ Left err ->
+ throwError $ PandocParseError $
+ "Failed to parse PPTX: " <> err
+
+ Left err ->
+ throwError $ PandocParseError $
+ "Failed to unpack PPTX archive: " <> T.pack err
diff --git a/src/Text/Pandoc/Readers/Pptx/Parse.hs b/src/Text/Pandoc/Readers/Pptx/Parse.hs
new file mode 100644
index 000000000..34ab0728a
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx/Parse.hs
@@ -0,0 +1,216 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Pptx.Parse
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Parsing of PPTX archive to intermediate representation.
+-}
+module Text.Pandoc.Readers.Pptx.Parse
+ ( Pptx(..)
+ , PresentationDoc(..)
+ , PptxSlide(..)
+ , SlideId(..)
+ , archiveToPptx
+ ) where
+
+import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry)
+import qualified Data.ByteString.Lazy as B
+import Data.List (find)
+import Data.Maybe (mapMaybe)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Encoding as TL
+import Data.Text (Text)
+import System.FilePath (splitFileName)
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.XML.Light
+import Text.Read (readMaybe)
+
+-- | Slide identifier
+newtype SlideId = SlideId Int deriving (Show, Eq, Ord)
+
+-- | Complete PPTX document (intermediate representation)
+data Pptx = Pptx
+ { pptxPresentation :: PresentationDoc
+ , pptxSlides :: [PptxSlide]
+ , pptxArchive :: Archive
+ } deriving (Show)
+
+-- | Individual slide data
+data PptxSlide = PptxSlide
+ { slideId :: SlideId
+ , slidePath :: FilePath
+ , slideElement :: Element -- The parsed p:sld element
+ , slideRels :: [(Text, Text)] -- Slide relationships
+ } deriving (Show)
+
+-- | Presentation-level information from presentation.xml
+data PresentationDoc = PresentationDoc
+ { presNameSpaces :: NameSpaces
+ , presSlideSize :: (Integer, Integer) -- (width, height) in pixels
+ , presSlideIds :: [(SlideId, Text)] -- (slideId, relationshipId)
+ } deriving (Show)
+
+-- | Parse PPTX archive to intermediate representation
+archiveToPptx :: Archive -> Either Text Pptx
+archiveToPptx archive = do
+ -- Find and parse presentation.xml
+ presPath <- getPresentationXmlPath archive
+ presElem <- loadXMLFromArchive archive presPath
+ presDoc <- elemToPresentation presElem
+
+ -- Load presentation relationships to resolve slide paths
+ presRelsPath <- getPresentationRelsPath archive presPath
+ presRels <- loadRelationships archive presRelsPath
+
+ -- Parse each slide
+ slides <- mapM (parseSlide archive presRels) (presSlideIds presDoc)
+
+ return $ Pptx presDoc slides archive
+
+-- | Find presentation.xml via root relationships
+getPresentationXmlPath :: Archive -> Either Text FilePath
+getPresentationXmlPath archive = do
+ -- Load _rels/.rels
+ relsEntry <- maybeToEither "Missing _rels/.rels" $
+ findEntryByPath "_rels/.rels" archive
+
+ relsElem <- parseXMLFromEntry relsEntry
+
+ -- The Relationships element has a default namespace, but Relationship children don't use prefix
+ -- We need to look at all children regardless of namespace
+ let relElems = onlyElems $ elContent relsElem
+
+ -- Look for relationship containing "officeDocument" in Type attribute
+ case find isOfficeDocRel relElems of
+ Nothing -> Left $ "No presentation.xml relationship found. Found " <>
+ T.pack (show (length relElems)) <> " relationships."
+ Just rel -> do
+ target <- maybeToEither "Missing Target attribute" $
+ findAttr (unqual "Target") rel
+ return $ T.unpack target -- Convert Text to FilePath
+
+ where
+ isOfficeDocRel el =
+ case findAttr (unqual "Type") el of
+ -- Must end with "/officeDocument" to avoid matching "/extended-properties"
+ Just relType -> "/officeDocument" `T.isSuffixOf` relType
+ Nothing -> False
+
+-- | Load and parse XML from archive entry
+loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
+loadXMLFromArchive archive path = do
+ entry <- maybeToEither ("Entry not found: " <> T.pack path) $
+ findEntryByPath path archive
+
+ let xmlBytes = fromEntry entry
+ parseXMLFromBS xmlBytes
+
+-- | Parse XML from ByteString
+parseXMLFromBS :: B.ByteString -> Either Text Element
+parseXMLFromBS = parseXMLElement . TL.decodeUtf8
+
+-- | Parse XML from Entry
+parseXMLFromEntry :: Entry -> Either Text Element
+parseXMLFromEntry = parseXMLFromBS . fromEntry
+
+-- | Parse presentation.xml element to PresentationDoc
+elemToPresentation :: Element -> Either Text PresentationDoc
+elemToPresentation presElem = do
+ let ns = elemToNameSpaces presElem
+
+ -- Extract slide size (with defaults)
+ let sizeElem = findChildByName ns "p" "sldSz" presElem
+ (widthEMU, heightEMU) = case sizeElem of
+ Just el ->
+ let cx = readAttrInt "cx" el
+ cy = readAttrInt "cy" el
+ in (cx, cy)
+ Nothing -> (9144000, 6858000) -- Default 10" x 7.5"
+
+ -- Convert EMUs to pixels (approximate for metadata)
+ let width = widthEMU `div` emusPerInch
+ height = heightEMU `div` emusPerInch
+
+ -- Extract slide ID list (optional - some presentations may have no slides)
+ let sldIdLstElem = findChildByName ns "p" "sldIdLst" presElem
+
+ slideRefs <- case sldIdLstElem of
+ Nothing -> return [] -- No slides is valid for templates/masters-only presentations
+ Just el -> do
+ let sldIdElems = findChildren (elemName ns "p" "sldId") el
+ mapM (extractSlideRef ns) (zip [1..] sldIdElems)
+
+ return $ PresentationDoc
+ { presNameSpaces = ns
+ , presSlideSize = (width, height)
+ , presSlideIds = slideRefs
+ }
+
+-- | Extract slide ID and relationship ID from p:sldId element
+extractSlideRef :: NameSpaces -> (Int, Element) -> Either Text (SlideId, Text)
+extractSlideRef ns (idx, sldIdElem) = do
+ relId <- maybeToEither ("Missing r:id in slide " <> T.pack (show idx)) $
+ findAttrByName ns "r" "id" sldIdElem
+
+ return (SlideId idx, relId)
+
+-- | Safe read attribute as Integer (with default of 0)
+readAttrInt :: Text -> Element -> Integer
+readAttrInt attrName el =
+ case findAttr (unqual attrName) el of
+ Just str -> case readMaybe (T.unpack str) of
+ Just n -> n
+ Nothing -> 0
+ Nothing -> 0
+
+-- | Get presentation relationships path
+getPresentationRelsPath :: Archive -> FilePath -> Either Text FilePath
+getPresentationRelsPath _archive presPath =
+ -- ppt/presentation.xml → ppt/_rels/presentation.xml.rels
+ let (dir, file) = splitFileName presPath
+ relsPath = dir ++ "/_rels/" ++ file ++ ".rels"
+ in Right relsPath
+
+-- | Load relationships from .rels file
+loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)]
+loadRelationships archive relsPath =
+ case findEntryByPath relsPath archive of
+ Nothing -> Right [] -- No relationships is OK
+ Just entry -> do
+ relsElem <- parseXMLFromEntry entry
+ let relElems = onlyElems $ elContent relsElem
+ return $ mapMaybe extractRelationship relElems
+ where
+ extractRelationship el = do
+ relId <- findAttr (unqual "Id") el
+ target <- findAttr (unqual "Target") el
+ return (relId, target)
+
+-- | Parse a single slide
+parseSlide :: Archive -> [(Text, Text)] -> (SlideId, Text) -> Either Text PptxSlide
+parseSlide archive rels (sid, relId) = do
+ -- Resolve relationship to get slide path
+ target <- maybeToEither ("Relationship not found: " <> relId) $
+ lookup relId rels
+
+ -- Resolve relative path: ppt/slides/slide1.xml
+ let slidePath' = "ppt/" <> T.unpack target
+
+ -- Load and parse slide XML
+ slideElem <- loadXMLFromArchive archive slidePath'
+
+ -- Load slide-specific relationships
+ slideRelsPath <- getPresentationRelsPath archive slidePath'
+ slideRels' <- loadRelationships archive slideRelsPath
+
+ return $ PptxSlide sid slidePath' slideElem slideRels'
+
+-- | Helper: Maybe a -> Either Text a
+maybeToEither :: Text -> Maybe a -> Either Text a
+maybeToEither err Nothing = Left err
+maybeToEither _ (Just x) = Right x
diff --git a/src/Text/Pandoc/Readers/Pptx/Shapes.hs b/src/Text/Pandoc/Readers/Pptx/Shapes.hs
new file mode 100644
index 000000000..d316a7b22
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx/Shapes.hs
@@ -0,0 +1,330 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# OPTIONS_GHC -Wno-partial-fields #-}
+{- |
+ Module : Text.Pandoc.Readers.Pptx.Shapes
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Parsing of PPTX shapes (text boxes, images, tables, diagrams).
+-}
+module Text.Pandoc.Readers.Pptx.Shapes
+ ( PptxShape(..)
+ , PptxParagraph(..)
+ , BulletType(..)
+ , parseShapes
+ , parseShape
+ , shapeToBlocks
+ , isTitlePlaceholder
+ , extractDrawingMLText
+ ) where
+
+import Codec.Archive.Zip (Archive, findEntryByPath, fromEntry)
+import qualified Data.ByteString.Lazy as B
+import Data.List (find, groupBy)
+import Data.Maybe (mapMaybe)
+import qualified Data.Text as T
+import Data.Text (Text)
+import Text.Read (readMaybe)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import qualified Text.Pandoc.Class.PandocMonad as P
+import Text.Pandoc.Definition
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.Readers.Pptx.SmartArt
+import Text.Pandoc.XML.Light
+
+-- | Paragraph with bullet/numbering information
+data PptxParagraph = PptxParagraph
+ { paraLevel :: Int -- Bullet level (0, 1, 2...)
+ , paraBullet :: BulletType
+ , paraText :: Text
+ } deriving (Show)
+
+-- | Bullet type
+data BulletType
+ = NoBullet
+ | Bullet -- Has bullet (character detected or implicit)
+ | WingdingsBullet -- Detected via Wingdings symbol
+ deriving (Show, Eq)
+
+-- | Shape types in PPTX slides
+data PptxShape
+ = PptxTextBox [PptxParagraph] -- Parsed paragraphs with bullet info
+ | PptxPicture
+ { picRelId :: Text -- Relationship ID (lazy loading)
+ , picTitle :: Text
+ , picAlt :: Text
+ }
+ | PptxTable [[Text]] -- Simple text cells for now
+ | PptxDiagramRef
+ { dgmDataRelId :: Text -- Relationship to data.xml
+ , dgmLayoutRelId :: Text -- Relationship to layout.xml
+ }
+ | PptxGraphic Text -- Placeholder for other graphics
+ deriving (Show)
+
+-- | Parse all shapes from shape tree
+parseShapes :: NameSpaces -> Element -> [PptxShape]
+parseShapes ns spTreeElem =
+ let shapeElems = onlyElems $ elContent spTreeElem
+ -- Merge parent namespaces with element namespaces
+ ns' = ns <> elemToNameSpaces spTreeElem
+ in mapMaybe (parseShape ns') shapeElems
+
+-- | Parse individual shape element
+parseShape :: NameSpaces -> Element -> Maybe PptxShape
+parseShape ns el
+ -- Text box: <p:sp> with <p:txBody>
+ | isElem ns "p" "sp" el =
+ case findChildByName ns "p" "txBody" el of
+ Just txBody ->
+ let paras = parseParagraphs ns txBody
+ in if null paras
+ then Nothing
+ else Just $ PptxTextBox paras
+ Nothing -> Nothing
+
+ -- Picture: <p:pic>
+ | isElem ns "p" "pic" el = do
+ nvPicPr <- findChildByName ns "p" "nvPicPr" el
+ cNvPr <- findChildByName ns "p" "cNvPr" nvPicPr
+
+ let title = maybe "" id $ findAttr (unqual "name") cNvPr
+ alt = maybe "" id $ findAttr (unqual "descr") cNvPr
+
+ -- Get blip relationship ID
+ blipFill <- findChildByName ns "p" "blipFill" el
+ blip <- findChildByName ns "a" "blip" blipFill
+ relId <- findAttrByName ns "r" "embed" blip
+
+ return $ PptxPicture relId title alt
+
+ -- GraphicFrame: table or diagram
+ | isElem ns "p" "graphicFrame" el =
+ case findChildByName ns "a" "graphic" el >>=
+ findChildByName ns "a" "graphicData" of
+ Nothing -> Nothing
+ Just graphicData ->
+ case findAttr (unqual "uri") graphicData of
+ Nothing -> Just $ PptxGraphic "no-uri"
+ Just uri ->
+ if "table" `T.isInfixOf` uri
+ then
+ -- Table
+ case findChildByName ns "a" "tbl" graphicData of
+ Just tbl ->
+ let rows = parseTableRows ns tbl
+ in Just $ PptxTable rows
+ Nothing -> Nothing
+ else if "diagram" `T.isInfixOf` uri
+ then
+ -- SmartArt diagram - dgm namespace is declared inline on relIds element
+ let dgmRelIds = find (\e -> qName (elName e) == "relIds") (elChildren graphicData)
+ in case dgmRelIds of
+ Nothing -> Just $ PptxGraphic "diagram-no-relIds"
+ Just relIdsElem ->
+ -- Get r:dm and r:lo attributes (r namespace is in parent)
+ let ns' = ns <> elemToNameSpaces relIdsElem
+ in case (findAttrByName ns' "r" "dm" relIdsElem,
+ findAttrByName ns' "r" "lo" relIdsElem) of
+ (Just dataRelId, Just layoutRelId) ->
+ Just $ PptxDiagramRef dataRelId layoutRelId
+ _ -> Just $ PptxGraphic "diagram-missing-rels"
+ else
+ -- Other graphic (chart, etc.)
+ Just $ PptxGraphic ("other: " <> uri)
+
+ -- Skip other shapes for now
+ | otherwise = Nothing
+
+-- | Parse table rows (simple text extraction)
+parseTableRows :: NameSpaces -> Element -> [[Text]]
+parseTableRows ns tblElem =
+ let trElems = findChildrenByName ns "a" "tr" tblElem
+ in map (parseTableRow ns) trElems
+
+parseTableRow :: NameSpaces -> Element -> [Text]
+parseTableRow ns trElem =
+ let tcElems = findChildrenByName ns "a" "tc" trElem
+ in map extractCellText tcElems
+ where
+ extractCellText tcElem =
+ -- Get text from txBody/a:p/a:r/a:t
+ case findChildByName ns "a" "txBody" tcElem of
+ Just txBody -> extractDrawingMLText txBody
+ Nothing -> ""
+
+-- | Convert shape to Pandoc blocks
+shapeToBlocks :: PandocMonad m => Archive -> [(Text, Text)] -> PptxShape -> m [Block]
+shapeToBlocks _archive _rels (PptxTextBox paras) =
+ return $ paragraphsToBlocks paras
+shapeToBlocks archive rels (PptxPicture relId title alt) = do
+ -- Resolve relationship to get media path
+ case lookup relId rels of
+ Nothing -> return [] -- Image not found
+ Just target -> do
+ let mediaPath = resolveMediaPath target
+
+ -- Load image bytes and add to MediaBag
+ case loadMediaFromArchive archive mediaPath of
+ Nothing -> return []
+ Just mediaBytes -> do
+ P.insertMedia (T.unpack mediaPath) Nothing mediaBytes
+
+ let altText = if T.null alt then [] else [Str alt]
+ return [Para [Image nullAttr altText (mediaPath, title)]]
+
+shapeToBlocks _archive _rels (PptxTable rows) =
+ -- Simple table representation for now
+ case rows of
+ [] -> return []
+ (headerRow:bodyRows) -> do
+ let makeCell text = Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str text]]
+ headerCells = map makeCell headerRow
+ bodyCells = map (map makeCell) bodyRows
+ caption = Caption Nothing []
+ colSpec = replicate (length headerRow) (AlignDefault, ColWidthDefault)
+ headerRow' = Row nullAttr headerCells
+ bodyRows' = map (Row nullAttr) bodyCells
+ thead = TableHead nullAttr [headerRow']
+ tbody = [TableBody nullAttr 0 [] bodyRows']
+ tfoot = TableFoot nullAttr []
+ return [Table nullAttr caption colSpec thead tbody tfoot]
+
+shapeToBlocks archive rels (PptxDiagramRef dataRelId layoutRelId) = do
+ -- Parse SmartArt diagram
+ case parseDiagram archive rels dataRelId layoutRelId of
+ Left err -> do
+ -- Failed to parse diagram, return placeholder
+ return [Para [Str $ "[Diagram parse error: " <> err <> "]"]]
+ Right diagram ->
+ return $ diagramToBlocks diagram
+shapeToBlocks _archive _rels (PptxGraphic text) =
+ -- Placeholder for other graphics (charts, etc.)
+ return [Para [Str $ "[Graphic: " <> text <> "]"]]
+
+-- | Resolve media path (handle relative paths)
+resolveMediaPath :: Text -> Text
+resolveMediaPath target =
+ if "../media/" `T.isPrefixOf` target
+ then "ppt/media/" <> T.drop 9 target -- "../media/" = 9 chars
+ else if "media/" `T.isPrefixOf` target
+ then "ppt/" <> target
+ else target
+
+-- | Load media file from archive
+loadMediaFromArchive :: Archive -> Text -> Maybe B.ByteString
+loadMediaFromArchive archive path =
+ case findEntryByPath (T.unpack path) archive of
+ Just entry -> Just $ fromEntry entry
+ Nothing -> Nothing
+
+-- | Parse paragraphs from text box
+parseParagraphs :: NameSpaces -> Element -> [PptxParagraph]
+parseParagraphs ns txBody =
+ let pElems = findChildrenByName ns "a" "p" txBody
+ in map (parseParagraph ns) pElems
+
+-- | Parse individual paragraph
+parseParagraph :: NameSpaces -> Element -> PptxParagraph
+parseParagraph ns pElem =
+ let level = parseBulletLevel ns pElem
+ bullet = detectBulletType ns pElem
+ text = extractParagraphText ns pElem
+ in PptxParagraph level bullet text
+
+-- | Parse bullet level from paragraph properties
+parseBulletLevel :: NameSpaces -> Element -> Int
+parseBulletLevel ns pElem =
+ case findChildByName ns "a" "pPr" pElem >>=
+ findAttr (unqual "lvl") >>=
+ (\s -> readMaybe (T.unpack s) :: Maybe Int) of
+ Just lvl -> lvl
+ Nothing -> 0 -- Default to level 0
+
+-- | Detect bullet type
+detectBulletType :: NameSpaces -> Element -> BulletType
+detectBulletType ns pElem =
+ -- Check for explicit <a:pPr><a:buChar>
+ case findChildByName ns "a" "pPr" pElem >>=
+ findChildByName ns "a" "buChar" of
+ Just _buCharElem -> Bullet
+ Nothing ->
+ -- Check for Wingdings symbol (common in PowerPoint)
+ if hasWingdingsSymbol ns pElem
+ then WingdingsBullet
+ else NoBullet
+
+-- | Check if paragraph starts with Wingdings symbol
+hasWingdingsSymbol :: NameSpaces -> Element -> Bool
+hasWingdingsSymbol ns pElem =
+ let runs = findChildrenByName ns "a" "r" pElem
+ checkRun r = case findChildByName ns "a" "rPr" r >>=
+ findChildByName ns "a" "sym" of
+ Just symElem ->
+ case findAttr (unqual "typeface") symElem of
+ Just typeface -> "Wingdings" `T.isInfixOf` typeface
+ Nothing -> False
+ Nothing -> False
+ in any checkRun runs
+
+-- | Extract text from paragraph
+extractParagraphText :: NameSpaces -> Element -> Text
+extractParagraphText _ns pElem =
+ -- Find all <a:t> elements and concatenate
+ let textElems = filterElementsName (\qn -> qName qn == "t") pElem
+ texts = map strContent textElems
+ in T.unwords $ filter (not . T.null) texts
+
+-- | Extract text from DrawingML element (finds all <a:t> descendants)
+extractDrawingMLText :: Element -> Text
+extractDrawingMLText el =
+ let textElems = filterElementsName (\qn -> qName qn == "t") el
+ texts = map strContent textElems
+ in T.unwords $ filter (not . T.null) texts
+
+-- | Convert paragraphs to blocks, grouping bullets into lists
+paragraphsToBlocks :: [PptxParagraph] -> [Block]
+paragraphsToBlocks paras =
+ -- If we have multiple paragraphs with bullets, group them
+ let hasBullets = any (\p -> paraBullet p /= NoBullet) paras
+ in if hasBullets
+ then groupBulletParagraphs paras
+ else map (\p -> Para [Str $ paraText p]) paras
+
+-- | Group bullet paragraphs into lists
+groupBulletParagraphs :: [PptxParagraph] -> [Block]
+groupBulletParagraphs paras =
+ let grouped = groupBy sameBulletLevel paras
+ in concatMap groupToBlock grouped
+ where
+ sameBulletLevel p1 p2 =
+ (paraBullet p1 /= NoBullet) &&
+ (paraBullet p2 /= NoBullet) &&
+ (paraLevel p1 == paraLevel p2)
+
+ groupToBlock :: [PptxParagraph] -> [Block]
+ groupToBlock [] = []
+ groupToBlock ps@(p:_)
+ | paraBullet p /= NoBullet =
+ -- Bullet list
+ let items = map (\para -> [Plain [Str $ paraText para]]) ps
+ in [BulletList items]
+ | otherwise =
+ -- Plain paragraph
+ map (\para -> Para [Str $ paraText para]) ps
+
+-- | Check if shape is title placeholder (also used in Slides module)
+isTitlePlaceholder :: NameSpaces -> Element -> Bool
+isTitlePlaceholder ns el =
+ case findChildByName ns "p" "nvSpPr" el >>=
+ findChildByName ns "p" "nvPr" >>=
+ findChildByName ns "p" "ph" of
+ Just phElem ->
+ case findAttr (unqual "type") phElem of
+ Just phType -> phType == "title" || phType == "ctrTitle"
+ Nothing -> False
+ Nothing -> False
diff --git a/src/Text/Pandoc/Readers/Pptx/Slides.hs b/src/Text/Pandoc/Readers/Pptx/Slides.hs
new file mode 100644
index 000000000..5e24af26f
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx/Slides.hs
@@ -0,0 +1,83 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Pptx.Slides
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Conversion of PPTX slides to Pandoc AST blocks.
+-}
+module Text.Pandoc.Readers.Pptx.Slides
+ ( pptxToOutput
+ ) where
+
+import Codec.Archive.Zip (Archive)
+import Data.List (find)
+import Data.Maybe (mapMaybe)
+import qualified Data.Text as T
+import Data.Text (Text)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.Readers.Pptx.Parse
+import Text.Pandoc.Readers.Pptx.Shapes
+import Text.Pandoc.XML.Light
+
+-- | Convert Pptx intermediate representation to Pandoc AST
+pptxToOutput :: PandocMonad m => ReaderOptions -> Pptx -> m (Meta, [Block])
+pptxToOutput _opts pptx = do
+ let slides = pptxSlides pptx
+ archive = pptxArchive pptx
+
+ -- Convert each slide to blocks
+ slideBlocks <- concat <$> mapM (slideToBlocks archive) slides
+
+ return (mempty, slideBlocks)
+
+-- | Convert slide to blocks
+slideToBlocks :: PandocMonad m => Archive -> PptxSlide -> m [Block]
+slideToBlocks archive slide = do
+ let SlideId n = slideId slide
+ slideElem = slideElement slide
+ rels = slideRels slide
+ ns = elemToNameSpaces slideElem
+
+ -- Extract title from title placeholder
+ title = extractSlideTitle ns slideElem
+
+ -- Create header
+ slideIdent = "slide-" <> T.pack (show n)
+ headerText = if T.null title
+ then "Slide " <> T.pack (show n)
+ else title
+ header = Header 2 (slideIdent, [], []) [Str headerText]
+
+ -- Parse shapes and convert to blocks
+ case findChildByName ns "p" "cSld" slideElem >>=
+ findChildByName ns "p" "spTree" of
+ Nothing -> return [header]
+ Just spTree -> do
+ -- Filter out title placeholder shapes before parsing
+ let allShapeElems = onlyElems $ elContent spTree
+ nonTitleShapeElems = filter (not . isTitlePlaceholder ns) allShapeElems
+ shapes = mapMaybe (parseShape ns) nonTitleShapeElems
+ shapeBlocks <- concat <$> mapM (shapeToBlocks archive rels) shapes
+ return $ header : shapeBlocks
+
+-- | Extract title from title placeholder
+extractSlideTitle :: NameSpaces -> Element -> Text
+extractSlideTitle ns slideElem =
+ case findChildByName ns "p" "cSld" slideElem >>=
+ findChildByName ns "p" "spTree" of
+ Nothing -> ""
+ Just spTree ->
+ -- Find shape with ph type="title"
+ let shapes = onlyElems $ elContent spTree
+ titleShape = find (isTitlePlaceholder ns) shapes
+ in maybe "" extractDrawingMLText titleShape
+
+-- isTitlePlaceholder is imported from Shapes module
diff --git a/src/Text/Pandoc/Readers/Pptx/SmartArt.hs b/src/Text/Pandoc/Readers/Pptx/SmartArt.hs
new file mode 100644
index 000000000..64e4a8649
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx/SmartArt.hs
@@ -0,0 +1,220 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Readers.Pptx.SmartArt
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+SmartArt diagram parsing and text extraction for PPTX.
+-}
+module Text.Pandoc.Readers.Pptx.SmartArt
+ ( PptxDiagram(..)
+ , parseDiagram
+ , diagramToBlocks
+ ) where
+
+import Codec.Archive.Zip (Archive, findEntryByPath, fromEntry)
+import qualified Data.Map.Strict as M
+import Data.Maybe (mapMaybe)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Encoding as TL
+import Data.Text (Text)
+import Text.Pandoc.Definition
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.XML.Light
+
+-- | SmartArt diagram data
+data PptxDiagram = PptxDiagram
+ { diagramType :: Text -- Layout type (chevron, cycle, etc.)
+ , diagramNodes :: [(Text, [Text])] -- (nodeText, childTexts)
+ } deriving (Show)
+
+-- | Parse SmartArt diagram from relationship IDs
+parseDiagram :: Archive
+ -> [(Text, Text)] -- Slide relationships
+ -> Text -- data relationship ID
+ -> Text -- layout relationship ID
+ -> Either Text PptxDiagram
+parseDiagram archive rels dataRelId layoutRelId = do
+ -- Resolve relationships to file paths
+ dataTarget <- maybeToEither ("Relationship not found: " <> dataRelId) $
+ lookup dataRelId rels
+ layoutTarget <- maybeToEither ("Relationship not found: " <> layoutRelId) $
+ lookup layoutRelId rels
+
+ -- Resolve relative paths (diagrams are in ../diagrams/ from slides/)
+ let dataPath = resolveDiagramPath dataTarget
+ layoutPath = resolveDiagramPath layoutTarget
+
+ -- Load XML files
+ dataElem <- loadXMLFromArchive archive dataPath
+ layoutElem <- loadXMLFromArchive archive layoutPath
+
+ -- Extract layout type
+ layoutType <- extractLayoutType layoutElem
+
+ -- Extract text nodes with hierarchy
+ nodes <- extractDiagramNodes dataElem
+
+ return $ PptxDiagram layoutType nodes
+
+-- | Resolve diagram path (handle ../diagrams/ relative paths)
+resolveDiagramPath :: Text -> FilePath
+resolveDiagramPath target =
+ if "../diagrams/" `T.isPrefixOf` target
+ then "ppt/diagrams/" ++ T.unpack (T.drop 12 target) -- "../diagrams/" = 12 chars
+ else T.unpack target
+
+-- | Load XML from archive
+loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
+loadXMLFromArchive archive path =
+ case findEntryByPath path archive of
+ Nothing -> Left $ "File not found in archive: " <> T.pack path
+ Just entry ->
+ let xmlBytes = fromEntry entry
+ lazyText = TL.decodeUtf8 xmlBytes
+ in parseXMLElement lazyText
+
+-- | Extract layout type from layout XML
+extractLayoutType :: Element -> Either Text Text
+extractLayoutType layoutElem = do
+ -- Look for uniqueId attribute: "urn:.../layout/chevron2"
+ case findAttr (unqual "uniqueId") layoutElem of
+ Just uid ->
+ -- Extract last part after last /
+ let layoutName = T.takeWhileEnd (/= '/') uid
+ in Right layoutName
+ Nothing ->
+ -- Fallback: look for title
+ case findChildByName ns "dgm" "title" layoutElem >>=
+ findAttr (unqual "val") of
+ Just title -> Right title
+ Nothing -> Right "unknown"
+ where
+ ns = elemToNameSpaces layoutElem
+
+-- | Extract text nodes from diagram data
+extractDiagramNodes :: Element -> Either Text [(Text, [Text])]
+extractDiagramNodes dataElem = do
+ let ns = elemToNameSpaces dataElem
+
+ -- Find point list
+ ptLst <- maybeToEither "Missing dgm:ptLst" $
+ findChildByName ns "dgm" "ptLst" dataElem
+
+ let ptElems = findChildrenByName ns "dgm" "pt" ptLst
+
+ -- Build node map: modelId → text
+ let nodeMap = M.fromList $ mapMaybe (extractNodeText ns) ptElems
+
+ -- Parse connections
+ let cxnLst = findChildByName ns "dgm" "cxnLst" dataElem
+ connections = maybe [] (parseConnections ns) cxnLst
+
+ -- Build parent-child map
+ let parentMap = buildParentMap connections
+
+ -- Find parent nodes (nodes that have children)
+ let parentIds = M.keys parentMap
+
+ -- Build hierarchy - only show nodes that are parents
+ -- (children are shown under their parents)
+ let hierarchy = map (buildNodeWithChildren nodeMap parentMap) parentIds
+ -- Filter out nodes with empty text (presentation nodes)
+ validHierarchy = filter (\(nodeText, _) -> not $ T.null nodeText) hierarchy
+
+ return validHierarchy
+
+-- | Extract text from a point element (returns Nothing if no text)
+extractNodeText :: NameSpaces -> Element -> Maybe (Text, Text)
+extractNodeText ns ptElem = do
+ modelId <- findAttr (unqual "modelId") ptElem
+
+ -- Extract text from dgm:t element (which contains a:p/a:r/a:t)
+ let text = case findChildByName ns "dgm" "t" ptElem of
+ Just tElem ->
+ -- Recursively get ALL text content from all descendants
+ getAllText tElem
+ Nothing -> ""
+
+ -- Only return nodes with actual text
+ if T.null (T.strip text)
+ then Nothing
+ else return (modelId, text)
+
+-- | Connection between nodes
+data Connection = Connection
+ { connType :: Text
+ , connSrc :: Text
+ , connDest :: Text
+ } deriving (Show)
+
+-- | Parse connections
+parseConnections :: NameSpaces -> Element -> [Connection]
+parseConnections ns cxnLst =
+ let cxnElems = findChildrenByName ns "dgm" "cxn" cxnLst
+ in mapMaybe (parseConnection ns) cxnElems
+
+parseConnection :: NameSpaces -> Element -> Maybe Connection
+parseConnection _ns cxnElem = do
+ let cxnType = maybe "" id $ findAttr (unqual "type") cxnElem -- Empty if no type
+ srcId <- findAttr (unqual "srcId") cxnElem
+ destId <- findAttr (unqual "destId") cxnElem
+ return $ Connection cxnType srcId destId
+
+-- | Build parent-child map from connections
+-- Use connections WITHOUT a type attribute (these are the data hierarchy)
+buildParentMap :: [Connection] -> M.Map Text [Text]
+buildParentMap connections =
+ let dataConnections = filter (\c -> T.null (connType c)) connections
+ in foldr addConn M.empty dataConnections
+ where
+ addConn conn m = M.insertWith (++) (connSrc conn) [connDest conn] m
+
+-- | Build node with its children
+buildNodeWithChildren :: M.Map Text Text -> M.Map Text [Text] -> Text -> (Text, [Text])
+buildNodeWithChildren nodeMap parentMap nodeId =
+ let nodeText = M.findWithDefault "" nodeId nodeMap
+ childIds = M.findWithDefault [] nodeId parentMap
+ -- Only include children that have text
+ childTexts = filter (not . T.null) $
+ map (\cid -> M.findWithDefault "" cid nodeMap) childIds
+ in (nodeText, childTexts)
+
+-- | Convert diagram to Pandoc blocks
+diagramToBlocks :: PptxDiagram -> [Block]
+diagramToBlocks diagram =
+ let nodes = diagramNodes diagram
+ layoutType = diagramType diagram
+
+ -- Build content blocks
+ contentBlocks = concatMap nodeToBlocks nodes
+
+ in [Div ("", ["smartart", layoutType], [("layout", layoutType)])
+ contentBlocks]
+
+-- | Convert node to blocks
+nodeToBlocks :: (Text, [Text]) -> [Block]
+nodeToBlocks (nodeText, childTexts) =
+ if null childTexts
+ then [Para [Strong [Str nodeText]]]
+ else [ Para [Strong [Str nodeText]]
+ , BulletList [[Plain [Str child]] | child <- childTexts]
+ ]
+
+-- | Recursively extract all text from an element and its descendants
+getAllText :: Element -> Text
+getAllText el =
+ let textFromContent (Text cdata) = cdData cdata
+ textFromContent (Elem e) = getAllText e
+ textFromContent _ = ""
+ texts = map textFromContent (elContent el)
+ in T.unwords $ filter (not . T.null) texts
+
+-- Helper functions
+maybeToEither :: Text -> Maybe a -> Either Text a
+maybeToEither err Nothing = Left err
+maybeToEither _ (Just x) = Right x
diff --git a/test/Tests/Readers/Pptx.hs b/test/Tests/Readers/Pptx.hs
new file mode 100644
index 000000000..613d5b50f
--- /dev/null
+++ b/test/Tests/Readers/Pptx.hs
@@ -0,0 +1,63 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Tests.Readers.Pptx
+ Copyright : © 2025 Anton Antic
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Anton Antic <[email protected]>
+ Stability : alpha
+ Portability : portable
+
+Tests for the PPTX reader.
+-}
+module Tests.Readers.Pptx (tests) where
+
+import Data.Algorithm.Diff (getDiff)
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Test.Tasty
+import Test.Tasty.Golden.Advanced
+import Tests.Helpers
+import Text.Pandoc
+import Text.Pandoc.UTF8 as UTF8
+
+defopts :: ReaderOptions
+defopts = def{ readerExtensions = getDefaultExtensions "pptx" }
+
+testCompare :: String -> FilePath -> FilePath -> TestTree
+testCompare = testCompareWithOpts defopts
+
+nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String)
+nativeDiff normPath expectedNative actualNative
+ | expectedNative == actualNative = return Nothing
+ | otherwise = Just <$> do
+ expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative)
+ actual <- T.unpack <$> runIOorExplode (writeNative def actualNative)
+ let dash = replicate 72 '-'
+ let diff = getDiff (lines actual) (lines expected)
+ return $ '\n' : dash ++
+ "\n--- " ++ normPath ++
+ "\n+++ " ++ "test" ++ "\n" ++
+ showDiff (1,1) diff ++ dash
+
+testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
+testCompareWithOpts opts testName pptxFP nativeFP =
+ goldenTest
+ testName
+ (do nf <- UTF8.toText <$> BS.readFile nativeFP
+ runIOorExplode (readNative def nf))
+ (do df <- B.readFile pptxFP
+ runIOorExplode (readPptx opts df))
+ (nativeDiff nativeFP)
+ (\a -> runIOorExplode (writeNative def{ writerTemplate = Just mempty} a)
+ >>= BS.writeFile nativeFP . UTF8.fromText)
+
+tests :: [TestTree]
+tests = [ testGroup "basic"
+ [ testCompare
+ "text extraction"
+ "pptx-reader/basic.pptx"
+ "pptx-reader/basic.native"
+ ]
+ ]
diff --git a/test/pptx-reader/basic.native b/test/pptx-reader/basic.native
new file mode 100644
index 000000000..954cb9345
--- /dev/null
+++ b/test/pptx-reader/basic.native
@@ -0,0 +1,149 @@
+[ Header 2 ( "slide-1" , [] , [] ) [ Str "LLMs" ]
+, BulletList
+ [ [ Plain
+ [ Str
+ "Provider \61664 Available LLMs \8211 who manages? How?"
+ ]
+ ]
+ , [ Plain
+ [ Str
+ "EW maintained list of \8220approved\8221 LLMs for Universal workers"
+ ]
+ ]
+ , [ Plain
+ [ Str
+ "Rebuilding of UWs to the \8220Newgen\8221 thing completely"
+ ]
+ ]
+ , [ Plain [ Str "Streaming support" ] ]
+ , [ Plain [ Str "Multimodal (voice streaming) models?" ] ]
+ ]
+, Header
+ 2
+ ( "slide-2" , [] , [] )
+ [ Str "Everworker venn diagram" ]
+, Para [ Str "SKILLS" ]
+, Para [ Str "" ]
+, Para [ Str "Specialized Workers / Workflows:" ]
+, Para [ Str "" ]
+, Para [ Str "n8n, UI Path, " ]
+, Para [ Str "other RPA" ]
+, Para [ Str "BRAINS" ]
+, Para [ Str "" ]
+, Para [ Str "Universal Workers / AI Agents:" ]
+, Para [ Str "" ]
+, Para [ Str "openai , anthropic," ]
+, Para [ Str "Crew AI, other " ]
+, Para [ Str "\8220AI natives\8221" ]
+, Para [ Str "KNOWLEDGE " ]
+, Para [ Str "" ]
+, Para [ Str "Data / " ]
+, Para [ Str "RAG Pipelines" ]
+, Para [ Str "" ]
+, Para
+ [ Str "Vector DBs, specialized data prep vendors, \8230" ]
+, Para [ Str "glean" ]
+, Para [ Str "EW" ]
+, Header 2 ( "slide-3" , [] , [] ) [ Str "Table" ]
+, Table
+ ( "" , [] , [] )
+ (Caption Nothing [])
+ [ ( AlignDefault , ColWidthDefault )
+ , ( AlignDefault , ColWidthDefault )
+ , ( AlignDefault , ColWidthDefault )
+ ]
+ (TableHead
+ ( "" , [] , [] )
+ [ Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Col1" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Col2" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Col3" ] ]
+ ]
+ ])
+ [ TableBody
+ ( "" , [] , [] )
+ (RowHeadColumns 0)
+ []
+ [ Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Name" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Anton" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Antich" ] ]
+ ]
+ , Row
+ ( "" , [] , [] )
+ [ Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "Age" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "23" ] ]
+ , Cell
+ ( "" , [] , [] )
+ AlignDefault
+ (RowSpan 1)
+ (ColSpan 1)
+ [ Plain [ Str "years" ] ]
+ ]
+ ]
+ ]
+ (TableFoot ( "" , [] , [] ) [])
+, Para
+ [ Image
+ ( "" , [] , [] ) [] ( "ppt/media/image1.png" , "Picture 6" )
+ ]
+, Header 2 ( "slide-4" , [] , [] ) [ Str "Smart Art" ]
+, Div
+ ( ""
+ , [ "smartart" , "chevron2" ]
+ , [ ( "layout" , "chevron2" ) ]
+ )
+ [ Para [ Strong [ Str "First" ] ]
+ , BulletList
+ [ [ Plain [ Str "another" ] ]
+ , [ Plain [ Str "subtitle" ] ]
+ ]
+ , Para [ Strong [ Str "Second" ] ]
+ , BulletList
+ [ [ Plain [ Str "and yet again" ] ]
+ , [ Plain [ Str "yet more" ] ]
+ ]
+ ]
+]
diff --git a/test/pptx-reader/basic.pptx b/test/pptx-reader/basic.pptx
new file mode 100644
index 000000000..44caef9c3
--- /dev/null
+++ b/test/pptx-reader/basic.pptx
Binary files differ
diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs
index 80d4ada7f..0d04b361f 100644
--- a/test/test-pandoc.hs
+++ b/test/test-pandoc.hs
@@ -12,6 +12,7 @@ import qualified Tests.Command
import qualified Tests.Old
import qualified Tests.Readers.Creole
import qualified Tests.Readers.Docx
+import qualified Tests.Readers.Pptx
import qualified Tests.Readers.DokuWiki
import qualified Tests.Readers.EPUB
import qualified Tests.Readers.FB2
@@ -95,6 +96,7 @@ tests pandocPath = testGroup "pandoc tests"
, testGroup "RST" Tests.Readers.RST.tests
, testGroup "RTF" Tests.Readers.RTF.tests
, testGroup "Docx" Tests.Readers.Docx.tests
+ , testGroup "Pptx" Tests.Readers.Pptx.tests
, testGroup "ODT" Tests.Readers.ODT.tests
, testGroup "Txt2Tags" Tests.Readers.Txt2Tags.tests
, testGroup "EPUB" Tests.Readers.EPUB.tests