From ec75b693e5618c12ddac872d48e084436f1e1b48 Mon Sep 17 00:00:00 2001 From: Anton Antich Date: Sun, 9 Nov 2025 13:48:14 +0100 Subject: Support pptx (PowerPoint) as an input format. New module `Text.Pandoc.Readers.Pptx`, exporting `readPptx`. [API change] Factored out some common OOXML functions from Text.Pandoc.Readers.Docx.Util into a non-exported module Text.Pandoc.Readers.OOXML.Shared. --- test/Tests/Readers/Pptx.hs | 63 ++++++++++++++++++ test/pptx-reader/basic.native | 149 ++++++++++++++++++++++++++++++++++++++++++ test/pptx-reader/basic.pptx | Bin 0 -> 111674 bytes test/test-pandoc.hs | 2 + 4 files changed, 214 insertions(+) create mode 100644 test/Tests/Readers/Pptx.hs create mode 100644 test/pptx-reader/basic.native create mode 100644 test/pptx-reader/basic.pptx (limited to 'test') diff --git a/test/Tests/Readers/Pptx.hs b/test/Tests/Readers/Pptx.hs new file mode 100644 index 000000000..613d5b50f --- /dev/null +++ b/test/Tests/Readers/Pptx.hs @@ -0,0 +1,63 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Tests.Readers.Pptx + Copyright : © 2025 Anton Antich + License : GNU GPL, version 2 or above + + Maintainer : Anton Antich + Stability : alpha + Portability : portable + +Tests for the PPTX reader. 
+-} +module Tests.Readers.Pptx (tests) where + +import Data.Algorithm.Diff (getDiff) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as B +import qualified Data.Text as T +import Test.Tasty +import Test.Tasty.Golden.Advanced +import Tests.Helpers +import Text.Pandoc +import Text.Pandoc.UTF8 as UTF8 + +defopts :: ReaderOptions +defopts = def{ readerExtensions = getDefaultExtensions "pptx" } + +testCompare :: String -> FilePath -> FilePath -> TestTree +testCompare = testCompareWithOpts defopts + +nativeDiff :: FilePath -> Pandoc -> Pandoc -> IO (Maybe String) +nativeDiff normPath expectedNative actualNative + | expectedNative == actualNative = return Nothing + | otherwise = Just <$> do + expected <- T.unpack <$> runIOorExplode (writeNative def expectedNative) + actual <- T.unpack <$> runIOorExplode (writeNative def actualNative) + let dash = replicate 72 '-' + let diff = getDiff (lines actual) (lines expected) + return $ '\n' : dash ++ + "\n--- " ++ normPath ++ + "\n+++ " ++ "test" ++ "\n" ++ + showDiff (1,1) diff ++ dash + +testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree +testCompareWithOpts opts testName pptxFP nativeFP = + goldenTest + testName + (do nf <- UTF8.toText <$> BS.readFile nativeFP + runIOorExplode (readNative def nf)) + (do df <- B.readFile pptxFP + runIOorExplode (readPptx opts df)) + (nativeDiff nativeFP) + (\a -> runIOorExplode (writeNative def{ writerTemplate = Just mempty} a) + >>= BS.writeFile nativeFP . 
UTF8.fromText) + +tests :: [TestTree] +tests = [ testGroup "basic" + [ testCompare + "text extraction" + "pptx-reader/basic.pptx" + "pptx-reader/basic.native" + ] + ] diff --git a/test/pptx-reader/basic.native b/test/pptx-reader/basic.native new file mode 100644 index 000000000..954cb9345 --- /dev/null +++ b/test/pptx-reader/basic.native @@ -0,0 +1,149 @@ +[ Header 2 ( "slide-1" , [] , [] ) [ Str "LLMs" ] +, BulletList + [ [ Plain + [ Str + "Provider \61664 Available LLMs \8211 who manages? How?" + ] + ] + , [ Plain + [ Str + "EW maintained list of \8220approved\8221 LLMs for Universal workers" + ] + ] + , [ Plain + [ Str + "Rebuilding of UWs to the \8220Newgen\8221 thing completely" + ] + ] + , [ Plain [ Str "Streaming support" ] ] + , [ Plain [ Str "Multimodal (voice streaming) models?" ] ] + ] +, Header + 2 + ( "slide-2" , [] , [] ) + [ Str "Everworker venn diagram" ] +, Para [ Str "SKILLS" ] +, Para [ Str "" ] +, Para [ Str "Specialized Workers / Workflows:" ] +, Para [ Str "" ] +, Para [ Str "n8n, UI Path, " ] +, Para [ Str "other RPA" ] +, Para [ Str "BRAINS" ] +, Para [ Str "" ] +, Para [ Str "Universal Workers / AI Agents:" ] +, Para [ Str "" ] +, Para [ Str "openai , anthropic," ] +, Para [ Str "Crew AI, other " ] +, Para [ Str "\8220AI natives\8221" ] +, Para [ Str "KNOWLEDGE " ] +, Para [ Str "" ] +, Para [ Str "Data / " ] +, Para [ Str "RAG Pipelines" ] +, Para [ Str "" ] +, Para + [ Str "Vector DBs, specialized data prep vendors, \8230" ] +, Para [ Str "glean" ] +, Para [ Str "EW" ] +, Header 2 ( "slide-3" , [] , [] ) [ Str "Table" ] +, Table + ( "" , [] , [] ) + (Caption Nothing []) + [ ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + , ( AlignDefault , ColWidthDefault ) + ] + (TableHead + ( "" , [] , [] ) + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col1" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col2" ] ] 
+ , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Col3" ] ] + ] + ]) + [ TableBody + ( "" , [] , [] ) + (RowHeadColumns 0) + [] + [ Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Name" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Anton" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Antich" ] ] + ] + , Row + ( "" , [] , [] ) + [ Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "Age" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "23" ] ] + , Cell + ( "" , [] , [] ) + AlignDefault + (RowSpan 1) + (ColSpan 1) + [ Plain [ Str "years" ] ] + ] + ] + ] + (TableFoot ( "" , [] , [] ) []) +, Para + [ Image + ( "" , [] , [] ) [] ( "ppt/media/image1.png" , "Picture 6" ) + ] +, Header 2 ( "slide-4" , [] , [] ) [ Str "Smart Art" ] +, Div + ( "" + , [ "smartart" , "chevron2" ] + , [ ( "layout" , "chevron2" ) ] + ) + [ Para [ Strong [ Str "First" ] ] + , BulletList + [ [ Plain [ Str "another" ] ] + , [ Plain [ Str "subtitle" ] ] + ] + , Para [ Strong [ Str "Second" ] ] + , BulletList + [ [ Plain [ Str "and yet again" ] ] + , [ Plain [ Str "yet more" ] ] + ] + ] +] diff --git a/test/pptx-reader/basic.pptx b/test/pptx-reader/basic.pptx new file mode 100644 index 000000000..44caef9c3 Binary files /dev/null and b/test/pptx-reader/basic.pptx differ diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs index 80d4ada7f..0d04b361f 100644 --- a/test/test-pandoc.hs +++ b/test/test-pandoc.hs @@ -12,6 +12,7 @@ import qualified Tests.Command import qualified Tests.Old import qualified Tests.Readers.Creole import qualified Tests.Readers.Docx +import qualified Tests.Readers.Pptx import qualified Tests.Readers.DokuWiki import qualified Tests.Readers.EPUB import qualified Tests.Readers.FB2 @@ -95,6 +96,7 @@ tests pandocPath = 
testGroup "pandoc tests" , testGroup "RST" Tests.Readers.RST.tests , testGroup "RTF" Tests.Readers.RTF.tests , testGroup "Docx" Tests.Readers.Docx.tests + , testGroup "Pptx" Tests.Readers.Pptx.tests , testGroup "ODT" Tests.Readers.ODT.tests , testGroup "Txt2Tags" Tests.Readers.Txt2Tags.tests , testGroup "EPUB" Tests.Readers.EPUB.tests -- cgit v1.2.3