Support pptx (PowerPoint) as an input format.

New module `Text.Pandoc.Readers.Pptx`, exporting `readPptx`. [API change] Factored out some common OOXML functions from Text.Pandoc.Readers.Docx.Util into a non-exported module Text.Pandoc.Readers.OOXML.Shared.
author: Anton Antich <[email protected]> 2025-11-09 13:48:14 +0100
committer: John MacFarlane <[email protected]> 2025-11-24 22:37:39 +0100
commit: ec75b693e5618c12ddac872d48e084436f1e1b48 (patch)
tree: 942edb1022128c5f769b928ee5ba3b0a6ee6b4f7
parent: 5d3b2916d616902d205146c108629053516fd9f4 (diff)
14 files changed, 1218 insertions, 44 deletions
diff --git a/MANUAL.txt b/MANUAL.txt
index 8f9ba8b0e..d394b2f6d 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -273,6 +273,7 @@ header when requesting a document from a URL:
     - `opml` ([OPML])
     - `org` ([Emacs Org mode])
     - `pod` (Perl's [Plain Old Documentation])
+    - `pptx` ([PowerPoint])
     - `ris` ([RIS] bibliography)
     - `rtf` ([Rich Text Format])
     - `rst` ([reStructuredText])
diff --git a/pandoc.cabal b/pandoc.cabal
index a5f21dbb2..f648dce11 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -426,6 +426,8 @@ extra-source-files:
                  test/pptx/*.pptx
                  test/pptx/**/*.pptx
                  test/pptx/**/*.native
+                 test/pptx-reader/basic.pptx
+                 test/pptx-reader/basic.native
                  test/ipynb/*.native
                  test/ipynb/*.in.native
                  test/ipynb/*.out.native
@@ -610,6 +612,7 @@ library
                    Text.Pandoc.Readers.TikiWiki,
                    Text.Pandoc.Readers.Txt2Tags,
                    Text.Pandoc.Readers.Docx,
+                   Text.Pandoc.Readers.Pptx,
                    Text.Pandoc.Readers.ODT,
                    Text.Pandoc.Readers.EPUB,
                    Text.Pandoc.Readers.Muse,
@@ -718,6 +721,11 @@ library
                    Text.Pandoc.Readers.Docx.Util,
                    Text.Pandoc.Readers.Docx.Symbols,
                    Text.Pandoc.Readers.Docx.Fields,
+                   Text.Pandoc.Readers.OOXML.Shared,
+                   Text.Pandoc.Readers.Pptx.Parse,
+                   Text.Pandoc.Readers.Pptx.Shapes,
+                   Text.Pandoc.Readers.Pptx.Slides,
+                   Text.Pandoc.Readers.Pptx.SmartArt,
                    Text.Pandoc.Readers.HTML.Parsing,
                    Text.Pandoc.Readers.HTML.Table,
                    Text.Pandoc.Readers.HTML.TagCategories,
@@ -854,6 +862,7 @@ test-suite test-pandoc
                   Tests.Readers.RST
                   Tests.Readers.RTF
                   Tests.Readers.Docx
+                  Tests.Readers.Pptx
                   Tests.Readers.ODT
                   Tests.Readers.Txt2Tags
                   Tests.Readers.EPUB
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 12d1c6c95..5f7b891e2 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -26,6 +26,7 @@ module Text.Pandoc.Readers
     Reader (..)
   , readers
   , readDocx
+  , readPptx
   , readODT
   , readMarkdown
   , readCommonMark
@@ -87,6 +88,7 @@ import Text.Pandoc.Readers.Markdown
 import Text.Pandoc.Readers.Creole
 import Text.Pandoc.Readers.DocBook
 import Text.Pandoc.Readers.Docx
+import Text.Pandoc.Readers.Pptx
 import Text.Pandoc.Readers.DokuWiki
 import Text.Pandoc.Readers.EPUB
 import Text.Pandoc.Readers.FB2
@@ -157,6 +159,7 @@ readers = [("native"       , TextReader readNative)
           ,("twiki"        , TextReader readTWiki)
           ,("tikiwiki"     , TextReader readTikiWiki)
           ,("docx"         , ByteStringReader readDocx)
+          ,("pptx"         , ByteStringReader readPptx)
           ,("odt"          , ByteStringReader readODT)
           ,("t2t"          , TextReader readTxt2Tags)
           ,("epub"         , ByteStringReader readEPUB)
diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs
index 88c1973f9..b8c51534b 100644
--- a/src/Text/Pandoc/Readers/Docx/Util.hs
+++ b/src/Text/Pandoc/Readers/Docx/Util.hs
@@ -24,51 +24,11 @@ module Text.Pandoc.Readers.Docx.Util (
                                       , extractChildren
                                       ) where
 
-import qualified Data.Text as T
-import Data.Text (Text)
-import Text.Pandoc.XML.Light
-import qualified Data.Map as M
 import Data.List (partition)
-
-type NameSpaces = M.Map Text Text
-
-elemToNameSpaces :: Element -> NameSpaces
-elemToNameSpaces = foldr (\(Attr qn val) ->
-                             case qn of
-                               QName s _ (Just "xmlns") -> M.insert s val
-                               _ -> id) mempty . elAttribs
-
-elemName :: NameSpaces -> Text -> Text -> QName
-elemName ns prefix name =
-  QName name (M.lookup prefix ns)
-             (if T.null prefix then Nothing else Just prefix)
-
-isElem :: NameSpaces -> Text -> Text -> Element -> Bool
-isElem ns prefix name element =
-  let ns' = ns <> elemToNameSpaces element
-  in qName (elName element) == name &&
-     qURI (elName element) == M.lookup prefix ns'
-
-findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
-findChildByName ns pref name el =
-  let ns' = ns <> elemToNameSpaces el
-  in  findChild (elemName ns' pref name) el
-
-findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
-findChildrenByName ns pref name el =
-  let ns' = ns <> elemToNameSpaces el
-  in  findChildren (elemName ns' pref name) el
-
--- | Like 'findChildrenByName', but searches descendants.
-findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
-findElementByName ns pref name el =
-  let ns' = ns <> elemToNameSpaces el
-  in  findElement (elemName ns' pref name) el
-
-findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
-findAttrByName ns pref name el =
-  let ns' = ns <> elemToNameSpaces el
-  in  findAttr (elemName ns' pref name) el
+import Text.Pandoc.XML.Light
+import Text.Pandoc.Readers.OOXML.Shared
+  (NameSpaces, elemName, isElem, elemToNameSpaces,
+   findChildByName, findChildrenByName, findElementByName, findAttrByName)
 
 
 -- | Removes child elements that satisfy a given condition.
diff --git a/src/Text/Pandoc/Readers/OOXML/Shared.hs b/src/Text/Pandoc/Readers/OOXML/Shared.hs
new file mode 100644
index 000000000..0aa07a736
--- /dev/null
+++ b/src/Text/Pandoc/Readers/OOXML/Shared.hs
@@ -0,0 +1,95 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.OOXML.Shared
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Shared utilities for Office Open XML (OOXML) readers (DOCX, PPTX).
+Provides common functions for ZIP archive handling, XML parsing,
+namespace management, and DrawingML parsing.
+-}
+module Text.Pandoc.Readers.OOXML.Shared
+  ( -- * Constants
+    emusPerInch
+  , emuToInches
+  , inchesToEmu
+    -- * Types
+  , NameSpaces
+  , elemName
+  , elemToNameSpaces
+  , isElem
+  , findChildByName
+  , findChildrenByName
+  , findElementByName
+  , findAttrByName
+  ) where
+
+import qualified Data.Map as M
+import qualified Data.Text as T
+import Data.Text (Text)
+import Text.Pandoc.XML.Light
+
+-- | Type alias for namespace mappings
+type NameSpaces = M.Map Text Text
+
+-- | English Metric Units per inch
+-- 1 inch = 914400 EMUs (used in OOXML for dimensions)
+emusPerInch :: Integer
+emusPerInch = 914400
+
+-- | Convert EMUs to inches
+emuToInches :: Integer -> Double
+emuToInches n = fromIntegral n / fromIntegral emusPerInch
+
+-- | Convert inches to EMUs
+inchesToEmu :: Double -> Integer
+inchesToEmu n = round (n * fromIntegral emusPerInch)
+
+-- | Extract namespace declarations from element attributes
+elemToNameSpaces :: Element -> NameSpaces
+elemToNameSpaces = foldr (\(Attr qn val) ->
+                            case qn of
+                              QName s _ (Just "xmlns") -> M.insert s val
+                              _ -> id) mempty . elAttribs
+
+-- | Create a qualified name from namespace map, prefix, and local name
+elemName :: NameSpaces -> Text -> Text -> QName
+elemName ns prefix name =
+  QName name
+        (M.lookup prefix ns)
+        (if T.null prefix then Nothing else Just prefix)
+
+-- | Check if element matches namespace prefix and local name
+isElem :: NameSpaces -> Text -> Text -> Element -> Bool
+isElem ns prefix name element =
+  let ns' = ns <> elemToNameSpaces element
+  in  qName (elName element) == name &&
+      qURI (elName element) == M.lookup prefix ns'
+
+-- | Find first child element matching namespace and name
+findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
+findChildByName ns pref name el =
+  let ns' = ns <> elemToNameSpaces el
+  in  findChild (elemName ns' pref name) el
+
+-- | Find all children matching namespace and name
+findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
+findChildrenByName ns pref name el =
+  let ns' = ns <> elemToNameSpaces el
+  in  findChildren (elemName ns' pref name) el
+
+-- | Find element anywhere in descendants matching namespace and name
+findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
+findElementByName ns pref name el =
+  let ns' = ns <> elemToNameSpaces el
+  in  findElement (elemName ns' pref name) el
+
+-- | Find attribute value by namespace prefix and name
+findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
+findAttrByName ns pref name el =
+  let ns' = ns <> elemToNameSpaces el
+  in  findAttr (elemName ns' pref name) el
diff --git a/src/Text/Pandoc/Readers/Pptx.hs b/src/Text/Pandoc/Readers/Pptx.hs
new file mode 100644
index 000000000..6ffc8ee3c
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx.hs
@@ -0,0 +1,43 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Pptx
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of PPTX (PowerPoint) documents to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Pptx (readPptx) where
+
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Codec.Archive.Zip (toArchiveOrFail)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition (Pandoc(..))
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Pptx.Parse (archiveToPptx)
+import Text.Pandoc.Readers.Pptx.Slides (pptxToOutput)
+
+-- | Read PPTX file into Pandoc AST
+readPptx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
+readPptx opts bytes =
+  case toArchiveOrFail bytes of
+    Right archive ->
+      case archiveToPptx archive of
+        Right pptx -> do
+          -- Convert Pptx intermediate to Pandoc AST
+          (meta, blocks) <- pptxToOutput opts pptx
+          return $ Pandoc meta blocks
+
+        Left err ->
+          throwError $ PandocParseError $
+            "Failed to parse PPTX: " <> err
+
+    Left err ->
+      throwError $ PandocParseError $
+        "Failed to unpack PPTX archive: " <> T.pack err
diff --git a/src/Text/Pandoc/Readers/Pptx/Parse.hs b/src/Text/Pandoc/Readers/Pptx/Parse.hs
new file mode 100644
index 000000000..34ab0728a
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx/Parse.hs
@@ -0,0 +1,216 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Pptx.Parse
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Parsing of PPTX archive to intermediate representation.
+-}
+module Text.Pandoc.Readers.Pptx.Parse
+  ( Pptx(..)
+  , PresentationDoc(..)
+  , PptxSlide(..)
+  , SlideId(..)
+  , archiveToPptx
+  ) where
+
+import Codec.Archive.Zip (Archive, Entry, findEntryByPath, fromEntry)
+import qualified Data.ByteString.Lazy as B
+import Data.List (find)
+import Data.Maybe (mapMaybe)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Encoding as TL
+import Data.Text (Text)
+import System.FilePath (splitFileName)
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.XML.Light
+import Text.Read (readMaybe)
+
+-- | Slide identifier
+newtype SlideId = SlideId Int deriving (Show, Eq, Ord)
+
+-- | Complete PPTX document (intermediate representation)
+data Pptx = Pptx
+  { pptxPresentation :: PresentationDoc
+  , pptxSlides       :: [PptxSlide]
+  , pptxArchive      :: Archive
+  } deriving (Show)
+
+-- | Individual slide data
+data PptxSlide = PptxSlide
+  { slideId      :: SlideId
+  , slidePath    :: FilePath
+  , slideElement :: Element     -- The parsed p:sld element
+  , slideRels    :: [(Text, Text)]  -- Slide relationships
+  } deriving (Show)
+
+-- | Presentation-level information from presentation.xml
+data PresentationDoc = PresentationDoc
+  { presNameSpaces   :: NameSpaces
+  , presSlideSize    :: (Integer, Integer)  -- (width, height) in pixels
+  , presSlideIds     :: [(SlideId, Text)]   -- (slideId, relationshipId)
+  } deriving (Show)
+
+-- | Parse PPTX archive to intermediate representation
+archiveToPptx :: Archive -> Either Text Pptx
+archiveToPptx archive = do
+  -- Find and parse presentation.xml
+  presPath <- getPresentationXmlPath archive
+  presElem <- loadXMLFromArchive archive presPath
+  presDoc <- elemToPresentation presElem
+
+  -- Load presentation relationships to resolve slide paths
+  presRelsPath <- getPresentationRelsPath archive presPath
+  presRels <- loadRelationships archive presRelsPath
+
+  -- Parse each slide
+  slides <- mapM (parseSlide archive presRels) (presSlideIds presDoc)
+
+  return $ Pptx presDoc slides archive
+
+-- | Find presentation.xml via root relationships
+getPresentationXmlPath :: Archive -> Either Text FilePath
+getPresentationXmlPath archive = do
+  -- Load _rels/.rels
+  relsEntry <- maybeToEither "Missing _rels/.rels" $
+               findEntryByPath "_rels/.rels" archive
+
+  relsElem <- parseXMLFromEntry relsEntry
+
+  -- The Relationships element has a default namespace, but Relationship children don't use prefix
+  -- We need to look at all children regardless of namespace
+  let relElems = onlyElems $ elContent relsElem
+
+  -- Look for relationship containing "officeDocument" in Type attribute
+  case find isOfficeDocRel relElems of
+    Nothing -> Left $ "No presentation.xml relationship found. Found " <>
+                     T.pack (show (length relElems)) <> " relationships."
+    Just rel -> do
+      target <- maybeToEither "Missing Target attribute" $
+                findAttr (unqual "Target") rel
+      return $ T.unpack target  -- Convert Text to FilePath
+
+  where
+    isOfficeDocRel el =
+      case findAttr (unqual "Type") el of
+        -- Must end with "/officeDocument" to avoid matching "/extended-properties"
+        Just relType -> "/officeDocument" `T.isSuffixOf` relType
+        Nothing -> False
+
+-- | Load and parse XML from archive entry
+loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
+loadXMLFromArchive archive path = do
+  entry <- maybeToEither ("Entry not found: " <> T.pack path) $
+           findEntryByPath path archive
+
+  let xmlBytes = fromEntry entry
+  parseXMLFromBS xmlBytes
+
+-- | Parse XML from ByteString
+parseXMLFromBS :: B.ByteString -> Either Text Element
+parseXMLFromBS = parseXMLElement . TL.decodeUtf8
+
+-- | Parse XML from Entry
+parseXMLFromEntry :: Entry -> Either Text Element
+parseXMLFromEntry = parseXMLFromBS . fromEntry
+
+-- | Parse presentation.xml element to PresentationDoc
+elemToPresentation :: Element -> Either Text PresentationDoc
+elemToPresentation presElem = do
+  let ns = elemToNameSpaces presElem
+
+  -- Extract slide size (with defaults)
+  let sizeElem = findChildByName ns "p" "sldSz" presElem
+      (widthEMU, heightEMU) = case sizeElem of
+        Just el ->
+          let cx = readAttrInt "cx" el
+              cy = readAttrInt "cy" el
+           in (cx, cy)
+        Nothing -> (9144000, 6858000)  -- Default 10" x 7.5"
+
+  -- Convert EMUs to pixels (approximate for metadata)
+  let width = widthEMU `div` emusPerInch
+      height = heightEMU `div` emusPerInch
+
+  -- Extract slide ID list (optional - some presentations may have no slides)
+  let sldIdLstElem = findChildByName ns "p" "sldIdLst" presElem
+
+  slideRefs <- case sldIdLstElem of
+    Nothing -> return []  -- No slides is valid for templates/masters-only presentations
+    Just el -> do
+      let sldIdElems = findChildren (elemName ns "p" "sldId") el
+      mapM (extractSlideRef ns) (zip [1..] sldIdElems)
+
+  return $ PresentationDoc
+    { presNameSpaces = ns
+    , presSlideSize = (width, height)
+    , presSlideIds = slideRefs
+    }
+
+-- | Extract slide ID and relationship ID from p:sldId element
+extractSlideRef :: NameSpaces -> (Int, Element) -> Either Text (SlideId, Text)
+extractSlideRef ns (idx, sldIdElem) = do
+  relId <- maybeToEither ("Missing r:id in slide " <> T.pack (show idx)) $
+           findAttrByName ns "r" "id" sldIdElem
+
+  return (SlideId idx, relId)
+
+-- | Safe read attribute as Integer (with default of 0)
+readAttrInt :: Text -> Element -> Integer
+readAttrInt attrName el =
+  case findAttr (unqual attrName) el of
+    Just str -> case readMaybe (T.unpack str) of
+      Just n -> n
+      Nothing -> 0
+    Nothing -> 0
+
+-- | Get presentation relationships path
+getPresentationRelsPath :: Archive -> FilePath -> Either Text FilePath
+getPresentationRelsPath _archive presPath =
+  -- ppt/presentation.xml → ppt/_rels/presentation.xml.rels
+  let (dir, file) = splitFileName presPath
+      relsPath = dir ++ "/_rels/" ++ file ++ ".rels"
+   in Right relsPath
+
+-- | Load relationships from .rels file
+loadRelationships :: Archive -> FilePath -> Either Text [(Text, Text)]
+loadRelationships archive relsPath =
+  case findEntryByPath relsPath archive of
+    Nothing -> Right []  -- No relationships is OK
+    Just entry -> do
+      relsElem <- parseXMLFromEntry entry
+      let relElems = onlyElems $ elContent relsElem
+      return $ mapMaybe extractRelationship relElems
+  where
+    extractRelationship el = do
+      relId <- findAttr (unqual "Id") el
+      target <- findAttr (unqual "Target") el
+      return (relId, target)
+
+-- | Parse a single slide
+parseSlide :: Archive -> [(Text, Text)] -> (SlideId, Text) -> Either Text PptxSlide
+parseSlide archive rels (sid, relId) = do
+  -- Resolve relationship to get slide path
+  target <- maybeToEither ("Relationship not found: " <> relId) $
+            lookup relId rels
+
+  -- Resolve relative path: ppt/slides/slide1.xml
+  let slidePath' = "ppt/" <> T.unpack target
+
+  -- Load and parse slide XML
+  slideElem <- loadXMLFromArchive archive slidePath'
+
+  -- Load slide-specific relationships
+  slideRelsPath <- getPresentationRelsPath archive slidePath'
+  slideRels' <- loadRelationships archive slideRelsPath
+
+  return $ PptxSlide sid slidePath' slideElem slideRels'
+
+-- | Helper: Maybe a -> Either Text a
+maybeToEither :: Text -> Maybe a -> Either Text a
+maybeToEither err Nothing = Left err
+maybeToEither _ (Just x) = Right x
diff --git a/src/Text/Pandoc/Readers/Pptx/Shapes.hs b/src/Text/Pandoc/Readers/Pptx/Shapes.hs
new file mode 100644
index 000000000..d316a7b22
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx/Shapes.hs
@@ -0,0 +1,330 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# OPTIONS_GHC -Wno-partial-fields #-}
+{- |
+   Module      : Text.Pandoc.Readers.Pptx.Shapes
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Parsing of PPTX shapes (text boxes, images, tables, diagrams).
+-}
+module Text.Pandoc.Readers.Pptx.Shapes
+  ( PptxShape(..)
+  , PptxParagraph(..)
+  , BulletType(..)
+  , parseShapes
+  , parseShape
+  , shapeToBlocks
+  , isTitlePlaceholder
+  , extractDrawingMLText
+  ) where
+
+import Codec.Archive.Zip (Archive, findEntryByPath, fromEntry)
+import qualified Data.ByteString.Lazy as B
+import Data.List (find, groupBy)
+import Data.Maybe (mapMaybe)
+import qualified Data.Text as T
+import Data.Text (Text)
+import Text.Read (readMaybe)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import qualified Text.Pandoc.Class.PandocMonad as P
+import Text.Pandoc.Definition
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.Readers.Pptx.SmartArt
+import Text.Pandoc.XML.Light
+
+-- | Paragraph with bullet/numbering information
+data PptxParagraph = PptxParagraph
+  { paraLevel   :: Int            -- Bullet level (0, 1, 2...)
+  , paraBullet  :: BulletType
+  , paraText    :: Text
+  } deriving (Show)
+
+-- | Bullet type
+data BulletType
+  = NoBullet
+  | Bullet                        -- Has bullet (character detected or implicit)
+  | WingdingsBullet              -- Detected via Wingdings symbol
+  deriving (Show, Eq)
+
+-- | Shape types in PPTX slides
+data PptxShape
+  = PptxTextBox [PptxParagraph]         -- Parsed paragraphs with bullet info
+  | PptxPicture
+      { picRelId  :: Text               -- Relationship ID (lazy loading)
+      , picTitle  :: Text
+      , picAlt    :: Text
+      }
+  | PptxTable [[Text]]                  -- Simple text cells for now
+  | PptxDiagramRef
+      { dgmDataRelId   :: Text          -- Relationship to data.xml
+      , dgmLayoutRelId :: Text          -- Relationship to layout.xml
+      }
+  | PptxGraphic Text                    -- Placeholder for other graphics
+  deriving (Show)
+
+-- | Parse all shapes from shape tree
+parseShapes :: NameSpaces -> Element -> [PptxShape]
+parseShapes ns spTreeElem =
+  let shapeElems = onlyElems $ elContent spTreeElem
+      -- Merge parent namespaces with element namespaces
+      ns' = ns <> elemToNameSpaces spTreeElem
+   in mapMaybe (parseShape ns') shapeElems
+
+-- | Parse individual shape element
+parseShape :: NameSpaces -> Element -> Maybe PptxShape
+parseShape ns el
+  -- Text box: <p:sp> with <p:txBody>
+  | isElem ns "p" "sp" el =
+      case findChildByName ns "p" "txBody" el of
+        Just txBody ->
+          let paras = parseParagraphs ns txBody
+           in if null paras
+              then Nothing
+              else Just $ PptxTextBox paras
+        Nothing -> Nothing
+
+  -- Picture: <p:pic>
+  | isElem ns "p" "pic" el = do
+      nvPicPr <- findChildByName ns "p" "nvPicPr" el
+      cNvPr <- findChildByName ns "p" "cNvPr" nvPicPr
+
+      let title = maybe "" id $ findAttr (unqual "name") cNvPr
+          alt = maybe "" id $ findAttr (unqual "descr") cNvPr
+
+      -- Get blip relationship ID
+      blipFill <- findChildByName ns "p" "blipFill" el
+      blip <- findChildByName ns "a" "blip" blipFill
+      relId <- findAttrByName ns "r" "embed" blip
+
+      return $ PptxPicture relId title alt
+
+  -- GraphicFrame: table or diagram
+  | isElem ns "p" "graphicFrame" el =
+      case findChildByName ns "a" "graphic" el >>=
+           findChildByName ns "a" "graphicData" of
+        Nothing -> Nothing
+        Just graphicData ->
+          case findAttr (unqual "uri") graphicData of
+            Nothing -> Just $ PptxGraphic "no-uri"
+            Just uri ->
+              if "table" `T.isInfixOf` uri
+                then
+                  -- Table
+                  case findChildByName ns "a" "tbl" graphicData of
+                    Just tbl ->
+                      let rows = parseTableRows ns tbl
+                       in Just $ PptxTable rows
+                    Nothing -> Nothing
+                else if "diagram" `T.isInfixOf` uri
+                  then
+                    -- SmartArt diagram - dgm namespace is declared inline on relIds element
+                    let dgmRelIds = find (\e -> qName (elName e) == "relIds") (elChildren graphicData)
+                     in case dgmRelIds of
+                          Nothing -> Just $ PptxGraphic "diagram-no-relIds"
+                          Just relIdsElem ->
+                            -- Get r:dm and r:lo attributes (r namespace is in parent)
+                            let ns' = ns <> elemToNameSpaces relIdsElem
+                             in case (findAttrByName ns' "r" "dm" relIdsElem,
+                                      findAttrByName ns' "r" "lo" relIdsElem) of
+                                  (Just dataRelId, Just layoutRelId) ->
+                                    Just $ PptxDiagramRef dataRelId layoutRelId
+                                  _ -> Just $ PptxGraphic "diagram-missing-rels"
+                  else
+                    -- Other graphic (chart, etc.)
+                    Just $ PptxGraphic ("other: " <> uri)
+
+  -- Skip other shapes for now
+  | otherwise = Nothing
+
+-- | Parse table rows (simple text extraction)
+parseTableRows :: NameSpaces -> Element -> [[Text]]
+parseTableRows ns tblElem =
+  let trElems = findChildrenByName ns "a" "tr" tblElem
+   in map (parseTableRow ns) trElems
+
+parseTableRow :: NameSpaces -> Element -> [Text]
+parseTableRow ns trElem =
+  let tcElems = findChildrenByName ns "a" "tc" trElem
+   in map extractCellText tcElems
+  where
+    extractCellText tcElem =
+      -- Get text from txBody/a:p/a:r/a:t
+      case findChildByName ns "a" "txBody" tcElem of
+        Just txBody -> extractDrawingMLText txBody
+        Nothing -> ""
+
+-- | Convert shape to Pandoc blocks
+shapeToBlocks :: PandocMonad m => Archive -> [(Text, Text)] -> PptxShape -> m [Block]
+shapeToBlocks _archive _rels (PptxTextBox paras) =
+  return $ paragraphsToBlocks paras
+shapeToBlocks archive rels (PptxPicture relId title alt) = do
+  -- Resolve relationship to get media path
+  case lookup relId rels of
+    Nothing -> return []  -- Image not found
+    Just target -> do
+      let mediaPath = resolveMediaPath target
+
+      -- Load image bytes and add to MediaBag
+      case loadMediaFromArchive archive mediaPath of
+        Nothing -> return []
+        Just mediaBytes -> do
+          P.insertMedia (T.unpack mediaPath) Nothing mediaBytes
+
+          let altText = if T.null alt then [] else [Str alt]
+          return [Para [Image nullAttr altText (mediaPath, title)]]
+
+shapeToBlocks _archive _rels (PptxTable rows) =
+  -- Simple table representation for now
+  case rows of
+    [] -> return []
+    (headerRow:bodyRows) -> do
+      let makeCell text = Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str text]]
+          headerCells = map makeCell headerRow
+          bodyCells = map (map makeCell) bodyRows
+          caption = Caption Nothing []
+          colSpec = replicate (length headerRow) (AlignDefault, ColWidthDefault)
+          headerRow' = Row nullAttr headerCells
+          bodyRows' = map (Row nullAttr) bodyCells
+          thead = TableHead nullAttr [headerRow']
+          tbody = [TableBody nullAttr 0 [] bodyRows']
+          tfoot = TableFoot nullAttr []
+      return [Table nullAttr caption colSpec thead tbody tfoot]
+
+shapeToBlocks archive rels (PptxDiagramRef dataRelId layoutRelId) = do
+  -- Parse SmartArt diagram
+  case parseDiagram archive rels dataRelId layoutRelId of
+    Left err -> do
+      -- Failed to parse diagram, return placeholder
+      return [Para [Str $ "[Diagram parse error: " <> err <> "]"]]
+    Right diagram ->
+      return $ diagramToBlocks diagram
+shapeToBlocks _archive _rels (PptxGraphic text) =
+  -- Placeholder for other graphics (charts, etc.)
+  return [Para [Str $ "[Graphic: " <> text <> "]"]]
+
+-- | Resolve media path (handle relative paths)
+resolveMediaPath :: Text -> Text
+resolveMediaPath target =
+  if "../media/" `T.isPrefixOf` target
+    then "ppt/media/" <> T.drop 9 target  -- "../media/" = 9 chars
+    else if "media/" `T.isPrefixOf` target
+      then "ppt/" <> target
+      else target
+
+-- | Load media file from archive
+loadMediaFromArchive :: Archive -> Text -> Maybe B.ByteString
+loadMediaFromArchive archive path =
+  case findEntryByPath (T.unpack path) archive of
+    Just entry -> Just $ fromEntry entry
+    Nothing -> Nothing
+
+-- | Parse paragraphs from text box
+parseParagraphs :: NameSpaces -> Element -> [PptxParagraph]
+parseParagraphs ns txBody =
+  let pElems = findChildrenByName ns "a" "p" txBody
+   in map (parseParagraph ns) pElems
+
+-- | Parse individual paragraph
+parseParagraph :: NameSpaces -> Element -> PptxParagraph
+parseParagraph ns pElem =
+  let level = parseBulletLevel ns pElem
+      bullet = detectBulletType ns pElem
+      text = extractParagraphText ns pElem
+   in PptxParagraph level bullet text
+
+-- | Parse bullet level from paragraph properties
+parseBulletLevel :: NameSpaces -> Element -> Int
+parseBulletLevel ns pElem =
+  case findChildByName ns "a" "pPr" pElem >>=
+       findAttr (unqual "lvl") >>=
+       (\s -> readMaybe (T.unpack s) :: Maybe Int) of
+    Just lvl -> lvl
+    Nothing -> 0  -- Default to level 0
+
+-- | Detect bullet type
+detectBulletType :: NameSpaces -> Element -> BulletType
+detectBulletType ns pElem =
+  -- Check for explicit <a:pPr><a:buChar>
+  case findChildByName ns "a" "pPr" pElem >>=
+       findChildByName ns "a" "buChar" of
+    Just _buCharElem -> Bullet
+    Nothing ->
+      -- Check for Wingdings symbol (common in PowerPoint)
+      if hasWingdingsSymbol ns pElem
+        then WingdingsBullet
+        else NoBullet
+
+-- | Check if paragraph starts with Wingdings symbol
+hasWingdingsSymbol :: NameSpaces -> Element -> Bool
+hasWingdingsSymbol ns pElem =
+  let runs = findChildrenByName ns "a" "r" pElem
+      checkRun r = case findChildByName ns "a" "rPr" r >>=
+                        findChildByName ns "a" "sym" of
+                     Just symElem ->
+                       case findAttr (unqual "typeface") symElem of
+                         Just typeface -> "Wingdings" `T.isInfixOf` typeface
+                         Nothing -> False
+                     Nothing -> False
+   in any checkRun runs
+
+-- | Extract text from paragraph
+extractParagraphText :: NameSpaces -> Element -> Text
+extractParagraphText _ns pElem =
+  -- Find all <a:t> elements and concatenate
+  let textElems = filterElementsName (\qn -> qName qn == "t") pElem
+      texts = map strContent textElems
+   in T.unwords $ filter (not . T.null) texts
+
+-- | Extract text from DrawingML element (finds all <a:t> descendants)
+extractDrawingMLText :: Element -> Text
+extractDrawingMLText el =
+  let textElems = filterElementsName (\qn -> qName qn == "t") el
+      texts = map strContent textElems
+   in T.unwords $ filter (not . T.null) texts
+
+-- | Convert paragraphs to blocks, grouping bullets into lists
+paragraphsToBlocks :: [PptxParagraph] -> [Block]
+paragraphsToBlocks paras =
+  -- If we have multiple paragraphs with bullets, group them
+  let hasBullets = any (\p -> paraBullet p /= NoBullet) paras
+   in if hasBullets
+      then groupBulletParagraphs paras
+      else map (\p -> Para [Str $ paraText p]) paras
+
+-- | Group bullet paragraphs into lists
+groupBulletParagraphs :: [PptxParagraph] -> [Block]
+groupBulletParagraphs paras =
+  let grouped = groupBy sameBulletLevel paras
+   in concatMap groupToBlock grouped
+  where
+    sameBulletLevel p1 p2 =
+      (paraBullet p1 /= NoBullet) &&
+      (paraBullet p2 /= NoBullet) &&
+      (paraLevel p1 == paraLevel p2)
+
+    groupToBlock :: [PptxParagraph] -> [Block]
+    groupToBlock [] = []
+    groupToBlock ps@(p:_)
+      | paraBullet p /= NoBullet =
+          -- Bullet list
+          let items = map (\para -> [Plain [Str $ paraText para]]) ps
+           in [BulletList items]
+      | otherwise =
+          -- Plain paragraph
+          map (\para -> Para [Str $ paraText para]) ps
+
+-- | Check if shape is title placeholder (also used in Slides module)
+isTitlePlaceholder :: NameSpaces -> Element -> Bool
+isTitlePlaceholder ns el =
+  case findChildByName ns "p" "nvSpPr" el >>=
+       findChildByName ns "p" "nvPr" >>=
+       findChildByName ns "p" "ph" of
+    Just phElem ->
+      case findAttr (unqual "type") phElem of
+        Just phType -> phType == "title" || phType == "ctrTitle"
+        Nothing -> False
+    Nothing -> False
diff --git a/src/Text/Pandoc/Readers/Pptx/Slides.hs b/src/Text/Pandoc/Readers/Pptx/Slides.hs
new file mode 100644
index 000000000..5e24af26f
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx/Slides.hs
@@ -0,0 +1,83 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Pptx.Slides
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of PPTX slides to Pandoc AST blocks.
+-}
+module Text.Pandoc.Readers.Pptx.Slides
+  ( pptxToOutput
+  ) where
+
+import Codec.Archive.Zip (Archive)
+import Data.List (find)
+import Data.Maybe (mapMaybe)
+import qualified Data.Text as T
+import Data.Text (Text)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.Readers.Pptx.Parse
+import Text.Pandoc.Readers.Pptx.Shapes
+import Text.Pandoc.XML.Light
+
+-- | Convert Pptx intermediate representation to Pandoc AST
+pptxToOutput :: PandocMonad m => ReaderOptions -> Pptx -> m (Meta, [Block])
+pptxToOutput _opts pptx = do
+  let slides = pptxSlides pptx
+      archive = pptxArchive pptx
+
+  -- Convert each slide to blocks
+  slideBlocks <- concat <$> mapM (slideToBlocks archive) slides
+
+  return (mempty, slideBlocks)
+
+-- | Convert slide to blocks
+slideToBlocks :: PandocMonad m => Archive -> PptxSlide -> m [Block]
+slideToBlocks archive slide = do
+  let SlideId n = slideId slide
+      slideElem = slideElement slide
+      rels = slideRels slide
+      ns = elemToNameSpaces slideElem
+
+      -- Extract title from title placeholder
+      title = extractSlideTitle ns slideElem
+
+      -- Create header
+      slideIdent = "slide-" <> T.pack (show n)
+      headerText = if T.null title
+                   then "Slide " <> T.pack (show n)
+                   else title
+      header = Header 2 (slideIdent, [], []) [Str headerText]
+
+  -- Parse shapes and convert to blocks
+  case findChildByName ns "p" "cSld" slideElem >>=
+       findChildByName ns "p" "spTree" of
+    Nothing -> return [header]
+    Just spTree -> do
+      -- Filter out title placeholder shapes before parsing
+      let allShapeElems = onlyElems $ elContent spTree
+          nonTitleShapeElems = filter (not . isTitlePlaceholder ns) allShapeElems
+          shapes = mapMaybe (parseShape ns) nonTitleShapeElems
+      shapeBlocks <- concat <$> mapM (shapeToBlocks archive rels) shapes
+      return $ header : shapeBlocks
+
+-- | Extract title from title placeholder
+extractSlideTitle :: NameSpaces -> Element -> Text
+extractSlideTitle ns slideElem =
+  case findChildByName ns "p" "cSld" slideElem >>=
+       findChildByName ns "p" "spTree" of
+    Nothing -> ""
+    Just spTree ->
+      -- Find shape with ph type="title"
+      let shapes = onlyElems $ elContent spTree
+          titleShape = find (isTitlePlaceholder ns) shapes
+       in maybe "" extractDrawingMLText titleShape
+
+-- isTitlePlaceholder is imported from Shapes module
diff --git a/src/Text/Pandoc/Readers/Pptx/SmartArt.hs b/src/Text/Pandoc/Readers/Pptx/SmartArt.hs
new file mode 100644
index 000000000..64e4a8649
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Pptx/SmartArt.hs
@@ -0,0 +1,220 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Pptx.SmartArt
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+SmartArt diagram parsing and text extraction for PPTX.
+-}
+module Text.Pandoc.Readers.Pptx.SmartArt
+  ( PptxDiagram(..)
+  , parseDiagram
+  , diagramToBlocks
+  ) where
+
+import Codec.Archive.Zip (Archive, findEntryByPath, fromEntry)
+import qualified Data.Map.Strict as M
+import Data.Maybe (mapMaybe)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Encoding as TL
+import Data.Text (Text)
+import Text.Pandoc.Definition
+import Text.Pandoc.Readers.OOXML.Shared
+import Text.Pandoc.XML.Light
+
+-- | SmartArt diagram data
+data PptxDiagram = PptxDiagram
+  { diagramType :: Text               -- Layout type (chevron, cycle, etc.)
+  , diagramNodes :: [(Text, [Text])]  -- (nodeText, childTexts)
+  } deriving (Show)
+
+-- | Parse SmartArt diagram from relationship IDs
+parseDiagram :: Archive
+             -> [(Text, Text)]  -- Slide relationships
+             -> Text            -- data relationship ID
+             -> Text            -- layout relationship ID
+             -> Either Text PptxDiagram
+parseDiagram archive rels dataRelId layoutRelId = do
+  -- Resolve relationships to file paths
+  dataTarget <- maybeToEither ("Relationship not found: " <> dataRelId) $
+                lookup dataRelId rels
+  layoutTarget <- maybeToEither ("Relationship not found: " <> layoutRelId) $
+                  lookup layoutRelId rels
+
+  -- Resolve relative paths (diagrams are in ../diagrams/ from slides/)
+  let dataPath = resolveDiagramPath dataTarget
+      layoutPath = resolveDiagramPath layoutTarget
+
+  -- Load XML files
+  dataElem <- loadXMLFromArchive archive dataPath
+  layoutElem <- loadXMLFromArchive archive layoutPath
+
+  -- Extract layout type
+  layoutType <- extractLayoutType layoutElem
+
+  -- Extract text nodes with hierarchy
+  nodes <- extractDiagramNodes dataElem
+
+  return $ PptxDiagram layoutType nodes
+
+-- | Resolve diagram path (handle ../diagrams/ relative paths)
+resolveDiagramPath :: Text -> FilePath
+resolveDiagramPath target =
+  if "../diagrams/" `T.isPrefixOf` target
+    then "ppt/diagrams/" ++ T.unpack (T.drop 12 target)  -- "../diagrams/" = 12 chars
+    else T.unpack target
+
+-- | Load XML from archive
+loadXMLFromArchive :: Archive -> FilePath -> Either Text Element
+loadXMLFromArchive archive path =
+  case findEntryByPath path archive of
+    Nothing -> Left $ "File not found in archive: " <> T.pack path
+    Just entry ->
+      let xmlBytes = fromEntry entry
+          lazyText = TL.decodeUtf8 xmlBytes
+       in parseXMLElement lazyText
+
+-- | Extract layout type from layout XML
+extractLayoutType :: Element -> Either Text Text
+extractLayoutType layoutElem = do
+  -- Look for uniqueId attribute: "urn:.../layout/chevron2"
+  case findAttr (unqual "uniqueId") layoutElem of
+    Just uid ->
+      -- Extract last part after last /
+      let layoutName = T.takeWhileEnd (/= '/') uid
+       in Right layoutName
+    Nothing ->
+      -- Fallback: look for title
+      case findChildByName ns "dgm" "title" layoutElem >>=
+           findAttr (unqual "val") of
+        Just title -> Right title
+        Nothing -> Right "unknown"
+  where
+    ns = elemToNameSpaces layoutElem
+
+-- | Extract text nodes from diagram data
+extractDiagramNodes :: Element -> Either Text [(Text, [Text])]
+extractDiagramNodes dataElem = do
+  let ns = elemToNameSpaces dataElem
+
+  -- Find point list
+  ptLst <- maybeToEither "Missing dgm:ptLst" $
+           findChildByName ns "dgm" "ptLst" dataElem
+
+  let ptElems = findChildrenByName ns "dgm" "pt" ptLst
+
+  -- Build node map: modelId → text
+  let nodeMap = M.fromList $ mapMaybe (extractNodeText ns) ptElems
+
+  -- Parse connections
+  let cxnLst = findChildByName ns "dgm" "cxnLst" dataElem
+      connections = maybe [] (parseConnections ns) cxnLst
+
+  -- Build parent-child map
+  let parentMap = buildParentMap connections
+
+  -- Find parent nodes (nodes that have children)
+  let parentIds = M.keys parentMap
+
+  -- Build hierarchy - only show nodes that are parents
+  -- (children are shown under their parents)
+  let hierarchy = map (buildNodeWithChildren nodeMap parentMap) parentIds
+      -- Filter out nodes with empty text (presentation nodes)
+      validHierarchy = filter (\(nodeText, _) -> not $ T.null nodeText) hierarchy
+
+  return validHierarchy
+
+-- | Extract text from a point element (returns Nothing if no text)
+extractNodeText :: NameSpaces -> Element -> Maybe (Text, Text)
+extractNodeText ns ptElem = do
+  modelId <- findAttr (unqual "modelId") ptElem
+
+  -- Extract text from dgm:t element (which contains a:p/a:r/a:t)
+  let text = case findChildByName ns "dgm" "t" ptElem of
+        Just tElem ->
+          -- Recursively get ALL text content from all descendants
+          getAllText tElem
+        Nothing -> ""
+
+  -- Only return nodes with actual text
+  if T.null (T.strip text)
+    then Nothing
+    else return (modelId, text)
+
+-- | Connection between nodes
+data Connection = Connection
+  { connType :: Text
+  , connSrc  :: Text
+  , connDest :: Text
+  } deriving (Show)
+
+-- | Parse connections
+parseConnections :: NameSpaces -> Element -> [Connection]
+parseConnections ns cxnLst =
+  let cxnElems = findChildrenByName ns "dgm" "cxn" cxnLst
+   in mapMaybe (parseConnection ns) cxnElems
+
+parseConnection :: NameSpaces -> Element -> Maybe Connection
+parseConnection _ns cxnElem = do
+  let cxnType = maybe "" id $ findAttr (unqual "type") cxnElem  -- Empty if no type
+  srcId <- findAttr (unqual "srcId") cxnElem
+  destId <- findAttr (unqual "destId") cxnElem
+  return $ Connection cxnType srcId destId
+
+-- | Build parent-child map from connections
+-- Use connections WITHOUT a type attribute (these are the data hierarchy)
+buildParentMap :: [Connection] -> M.Map Text [Text]
+buildParentMap connections =
+  let dataConnections = filter (\c -> T.null (connType c)) connections
+   in foldr addConn M.empty dataConnections
+  where
+    addConn conn m = M.insertWith (++) (connSrc conn) [connDest conn] m
+
+-- | Build node with its children
+buildNodeWithChildren :: M.Map Text Text -> M.Map Text [Text] -> Text -> (Text, [Text])
+buildNodeWithChildren nodeMap parentMap nodeId =
+  let nodeText = M.findWithDefault "" nodeId nodeMap
+      childIds = M.findWithDefault [] nodeId parentMap
+      -- Only include children that have text
+      childTexts = filter (not . T.null) $
+                   map (\cid -> M.findWithDefault "" cid nodeMap) childIds
+   in (nodeText, childTexts)
+
+-- | Convert diagram to Pandoc blocks
+diagramToBlocks :: PptxDiagram -> [Block]
+diagramToBlocks diagram =
+  let nodes = diagramNodes diagram
+      layoutType = diagramType diagram
+
+      -- Build content blocks
+      contentBlocks = concatMap nodeToBlocks nodes
+
+   in [Div ("", ["smartart", layoutType], [("layout", layoutType)])
+           contentBlocks]
+
+-- | Convert node to blocks
+nodeToBlocks :: (Text, [Text]) -> [Block]
+nodeToBlocks (nodeText, childTexts) =
+  if null childTexts
+    then [Para [Strong [Str nodeText]]]
+    else [ Para [Strong [Str nodeText]]
+         , BulletList [[Plain [Str child]] | child <- childTexts]
+         ]
+
+-- | Recursively extract all text from an element and its descendants
+getAllText :: Element -> Text
+getAllText el =
+  let textFromContent (Text cdata) = cdData cdata
+      textFromContent (Elem e) = getAllText e
+      textFromContent _ = ""
+      texts = map textFromContent (elContent el)
+   in T.unwords $ filter (not . T.null) texts
+
+-- Helper functions
+maybeToEither :: Text -> Maybe a -> Either Text a
+maybeToEither err Nothing = Left err
+maybeToEither _ (Just x) = Right x
diff --git a/test/Tests/Readers/Pptx.hs b/test/Tests/Readers/Pptx.hs
new file mode 100644
index 000000000..613d5b50f
--- /dev/null
+++ b/test/Tests/Readers/Pptx.hs
@@ -0,0 +1,63 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Tests.Readers.Pptx
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Tests for the PPTX reader.
+-}
+module Tests.Readers.Pptx (tests) where
+
+import Data.Algorithm.Diff (getDiff)
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Test.Tasty
+import Test.Tasty.Golden.Advanced
+import Tests.Helpers
+import Text.Pandoc
+import Text.Pandoc.UTF8 as UTF8
+
+defopts :: ReaderOptions
+defopts = def{ readerExtensions = getDefaultExtensions "pptx" }
+
+testCompare :: String -> FilePath -> FilePath -> TestTree
+testCompare = testCompareWithOpts defopts
+
+nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String)
+nativeDiff normPath expectedNative actualNative
+  | expectedNative == actualNative = return Nothing
+  | otherwise = Just <$> do
+      expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative)
+      actual <- T.unpack <$> runIOorExplode (writeNative def actualNative)
+      let dash = replicate 72 '-'
+      let diff = getDiff (lines actual) (lines expected)
+      return $ '\n' : dash ++
+               "\n--- " ++ normPath ++
+               "\n+++ " ++ "test" ++ "\n" ++
+               showDiff (1,1) diff ++ dash
+
+testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
+testCompareWithOpts opts testName pptxFP nativeFP =
+  goldenTest
+  testName
+  (do nf <- UTF8.toText <$> BS.readFile nativeFP
+      runIOorExplode (readNative def nf))
+  (do df <- B.readFile pptxFP
+      runIOorExplode (readPptx opts df))
+  (nativeDiff nativeFP)
+  (\a -> runIOorExplode (writeNative def{ writerTemplate = Just mempty} a)
+            >>= BS.writeFile nativeFP . UTF8.fromText)
+
+tests :: [TestTree]
+tests = [ testGroup "basic"
+          [ testCompare
+            "text extraction"
+            "pptx-reader/basic.pptx"
+            "pptx-reader/basic.native"
+          ]
+        ]
diff --git a/test/pptx-reader/basic.native b/test/pptx-reader/basic.native
new file mode 100644
index 000000000..954cb9345
--- /dev/null
+++ b/test/pptx-reader/basic.native
@@ -0,0 +1,149 @@
+[ Header 2 ( "slide-1" , [] , [] ) [ Str "LLMs" ]
+, BulletList
+    [ [ Plain
+          [ Str
+              "Provider  \61664 Available LLMs \8211 who manages? How?"
+          ]
+      ]
+    , [ Plain
+          [ Str
+              "EW maintained list of \8220approved\8221 LLMs for Universal workers"
+          ]
+      ]
+    , [ Plain
+          [ Str
+              "Rebuilding of UWs to the \8220Newgen\8221 thing completely"
+          ]
+      ]
+    , [ Plain [ Str "Streaming support" ] ]
+    , [ Plain [ Str "Multimodal (voice streaming) models?" ] ]
+    ]
+, Header
+    2
+    ( "slide-2" , [] , [] )
+    [ Str "Everworker   venn   diagram" ]
+, Para [ Str "SKILLS" ]
+, Para [ Str "" ]
+, Para [ Str "Specialized Workers / Workflows:" ]
+, Para [ Str "" ]
+, Para [ Str "n8n, UI Path, " ]
+, Para [ Str "other RPA" ]
+, Para [ Str "BRAINS" ]
+, Para [ Str "" ]
+, Para [ Str "Universal Workers / AI Agents:" ]
+, Para [ Str "" ]
+, Para [ Str "openai , anthropic," ]
+, Para [ Str "Crew AI, other " ]
+, Para [ Str "\8220AI natives\8221" ]
+, Para [ Str "KNOWLEDGE " ]
+, Para [ Str "" ]
+, Para [ Str "Data / " ]
+, Para [ Str "RAG Pipelines" ]
+, Para [ Str "" ]
+, Para
+    [ Str "Vector DBs, specialized data prep vendors, \8230" ]
+, Para [ Str "glean" ]
+, Para [ Str "EW" ]
+, Header 2 ( "slide-3" , [] , [] ) [ Str "Table" ]
+, Table
+    ( "" , [] , [] )
+    (Caption Nothing [])
+    [ ( AlignDefault , ColWidthDefault )
+    , ( AlignDefault , ColWidthDefault )
+    , ( AlignDefault , ColWidthDefault )
+    ]
+    (TableHead
+       ( "" , [] , [] )
+       [ Row
+           ( "" , [] , [] )
+           [ Cell
+               ( "" , [] , [] )
+               AlignDefault
+               (RowSpan 1)
+               (ColSpan 1)
+               [ Plain [ Str "Col1" ] ]
+           , Cell
+               ( "" , [] , [] )
+               AlignDefault
+               (RowSpan 1)
+               (ColSpan 1)
+               [ Plain [ Str "Col2" ] ]
+           , Cell
+               ( "" , [] , [] )
+               AlignDefault
+               (RowSpan 1)
+               (ColSpan 1)
+               [ Plain [ Str "Col3" ] ]
+           ]
+       ])
+    [ TableBody
+        ( "" , [] , [] )
+        (RowHeadColumns 0)
+        []
+        [ Row
+            ( "" , [] , [] )
+            [ Cell
+                ( "" , [] , [] )
+                AlignDefault
+                (RowSpan 1)
+                (ColSpan 1)
+                [ Plain [ Str "Name" ] ]
+            , Cell
+                ( "" , [] , [] )
+                AlignDefault
+                (RowSpan 1)
+                (ColSpan 1)
+                [ Plain [ Str "Anton" ] ]
+            , Cell
+                ( "" , [] , [] )
+                AlignDefault
+                (RowSpan 1)
+                (ColSpan 1)
+                [ Plain [ Str "Antich" ] ]
+            ]
+        , Row
+            ( "" , [] , [] )
+            [ Cell
+                ( "" , [] , [] )
+                AlignDefault
+                (RowSpan 1)
+                (ColSpan 1)
+                [ Plain [ Str "Age" ] ]
+            , Cell
+                ( "" , [] , [] )
+                AlignDefault
+                (RowSpan 1)
+                (ColSpan 1)
+                [ Plain [ Str "23" ] ]
+            , Cell
+                ( "" , [] , [] )
+                AlignDefault
+                (RowSpan 1)
+                (ColSpan 1)
+                [ Plain [ Str "years" ] ]
+            ]
+        ]
+    ]
+    (TableFoot ( "" , [] , [] ) [])
+, Para
+    [ Image
+        ( "" , [] , [] ) [] ( "ppt/media/image1.png" , "Picture 6" )
+    ]
+, Header 2 ( "slide-4" , [] , [] ) [ Str "Smart Art" ]
+, Div
+    ( ""
+    , [ "smartart" , "chevron2" ]
+    , [ ( "layout" , "chevron2" ) ]
+    )
+    [ Para [ Strong [ Str "First" ] ]
+    , BulletList
+        [ [ Plain [ Str "another" ] ]
+        , [ Plain [ Str "subtitle" ] ]
+        ]
+    , Para [ Strong [ Str "Second" ] ]
+    , BulletList
+        [ [ Plain [ Str "and yet again" ] ]
+        , [ Plain [ Str "yet more" ] ]
+        ]
+    ]
+]
diff --git a/test/pptx-reader/basic.pptx b/test/pptx-reader/basic.pptx
new file mode 100644
index 000000000..44caef9c3
--- /dev/null
+++ b/test/pptx-reader/basic.pptx
diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs
index 80d4ada7f..0d04b361f 100644
--- a/test/test-pandoc.hs
+++ b/test/test-pandoc.hs
@@ -12,6 +12,7 @@ import qualified Tests.Command
 import qualified Tests.Old
 import qualified Tests.Readers.Creole
 import qualified Tests.Readers.Docx
+import qualified Tests.Readers.Pptx
 import qualified Tests.Readers.DokuWiki
 import qualified Tests.Readers.EPUB
 import qualified Tests.Readers.FB2
@@ -95,6 +96,7 @@ tests pandocPath = testGroup "pandoc tests"
           , testGroup "RST" Tests.Readers.RST.tests
           , testGroup "RTF" Tests.Readers.RTF.tests
           , testGroup "Docx" Tests.Readers.Docx.tests
+          , testGroup "Pptx" Tests.Readers.Pptx.tests
           , testGroup "ODT" Tests.Readers.ODT.tests
           , testGroup "Txt2Tags" Tests.Readers.Txt2Tags.tests
           , testGroup "EPUB" Tests.Readers.EPUB.tests
author	Anton Antich <[email protected]>	2025-11-09 13:48:14 +0100
committer	John MacFarlane <[email protected]>	2025-11-24 22:37:39 +0100
commit	ec75b693e5618c12ddac872d48e084436f1e1b48 (patch)
tree	942edb1022128c5f769b928ee5ba3b0a6ee6b4f7
parent	5d3b2916d616902d205146c108629053516fd9f4 (diff)