1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
|
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -Wno-partial-fields #-}
{- |
Module : Text.Pandoc.Readers.Pptx.Shapes
Copyright : © 2025 Anton Antic
License : GNU GPL, version 2 or above
Maintainer : Anton Antic <[email protected]>
Stability : alpha
Portability : portable
Parsing of PPTX shapes (text boxes, images, tables, diagrams).
-}
module Text.Pandoc.Readers.Pptx.Shapes
( PptxShape(..)
, PptxParagraph(..)
, BulletType(..)
, parseShapes
, parseShape
, shapeToBlocks
, isTitlePlaceholder
, extractDrawingMLText
) where
import Codec.Archive.Zip (Archive, findEntryByPath, fromEntry)
import qualified Data.ByteString.Lazy as B
import Data.List (find, groupBy)
import Data.Maybe (mapMaybe)
import qualified Data.Text as T
import Data.Text (Text)
import Text.Read (readMaybe)
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import qualified Text.Pandoc.Class.PandocMonad as P
import Text.Pandoc.Definition
import Text.Pandoc.Readers.OOXML.Shared
import Text.Pandoc.Readers.Pptx.SmartArt
import Text.Pandoc.XML.Light
-- | Paragraph with bullet/numbering information
data PptxParagraph = PptxParagraph
{ paraLevel :: Int -- Bullet level (0, 1, 2...)
, paraBullet :: BulletType
, paraText :: Text
} deriving (Show)
-- | Bullet type
data BulletType
= NoBullet
| Bullet -- Has bullet (character detected or implicit)
| WingdingsBullet -- Detected via Wingdings symbol
deriving (Show, Eq)
-- | Shape types in PPTX slides
data PptxShape
= PptxTextBox [PptxParagraph] -- Parsed paragraphs with bullet info
| PptxPicture
{ picRelId :: Text -- Relationship ID (lazy loading)
, picTitle :: Text
, picAlt :: Text
}
| PptxTable [[Text]] -- Simple text cells for now
| PptxDiagramRef
{ dgmDataRelId :: Text -- Relationship to data.xml
, dgmLayoutRelId :: Text -- Relationship to layout.xml
}
| PptxGraphic Text -- Placeholder for other graphics
deriving (Show)
-- | Parse all shapes from shape tree
parseShapes :: NameSpaces -> Element -> [PptxShape]
parseShapes ns spTreeElem =
let shapeElems = onlyElems $ elContent spTreeElem
-- Merge parent namespaces with element namespaces
ns' = ns <> elemToNameSpaces spTreeElem
in mapMaybe (parseShape ns') shapeElems
-- | Parse individual shape element
parseShape :: NameSpaces -> Element -> Maybe PptxShape
parseShape ns el
-- Text box: <p:sp> with <p:txBody>
| isElem ns "p" "sp" el =
case findChildByName ns "p" "txBody" el of
Just txBody ->
let paras = parseParagraphs ns txBody
in if null paras
then Nothing
else Just $ PptxTextBox paras
Nothing -> Nothing
-- Picture: <p:pic>
| isElem ns "p" "pic" el = do
nvPicPr <- findChildByName ns "p" "nvPicPr" el
cNvPr <- findChildByName ns "p" "cNvPr" nvPicPr
let title = maybe "" id $ findAttr (unqual "name") cNvPr
alt = maybe "" id $ findAttr (unqual "descr") cNvPr
-- Get blip relationship ID
blipFill <- findChildByName ns "p" "blipFill" el
blip <- findChildByName ns "a" "blip" blipFill
relId <- findAttrByName ns "r" "embed" blip
return $ PptxPicture relId title alt
-- GraphicFrame: table or diagram
| isElem ns "p" "graphicFrame" el =
case findChildByName ns "a" "graphic" el >>=
findChildByName ns "a" "graphicData" of
Nothing -> Nothing
Just graphicData ->
case findAttr (unqual "uri") graphicData of
Nothing -> Just $ PptxGraphic "no-uri"
Just uri ->
if "table" `T.isInfixOf` uri
then
-- Table
case findChildByName ns "a" "tbl" graphicData of
Just tbl ->
let rows = parseTableRows ns tbl
in Just $ PptxTable rows
Nothing -> Nothing
else if "diagram" `T.isInfixOf` uri
then
-- SmartArt diagram - dgm namespace is declared inline on relIds element
let dgmRelIds = find (\e -> qName (elName e) == "relIds") (elChildren graphicData)
in case dgmRelIds of
Nothing -> Just $ PptxGraphic "diagram-no-relIds"
Just relIdsElem ->
-- Get r:dm and r:lo attributes (r namespace is in parent)
let ns' = ns <> elemToNameSpaces relIdsElem
in case (findAttrByName ns' "r" "dm" relIdsElem,
findAttrByName ns' "r" "lo" relIdsElem) of
(Just dataRelId, Just layoutRelId) ->
Just $ PptxDiagramRef dataRelId layoutRelId
_ -> Just $ PptxGraphic "diagram-missing-rels"
else
-- Other graphic (chart, etc.)
Just $ PptxGraphic ("other: " <> uri)
-- Skip other shapes for now
| otherwise = Nothing
-- | Parse table rows (simple text extraction)
parseTableRows :: NameSpaces -> Element -> [[Text]]
parseTableRows ns tblElem =
let trElems = findChildrenByName ns "a" "tr" tblElem
in map (parseTableRow ns) trElems
parseTableRow :: NameSpaces -> Element -> [Text]
parseTableRow ns trElem =
let tcElems = findChildrenByName ns "a" "tc" trElem
in map extractCellText tcElems
where
extractCellText tcElem =
-- Get text from txBody/a:p/a:r/a:t
case findChildByName ns "a" "txBody" tcElem of
Just txBody -> extractDrawingMLText txBody
Nothing -> ""
-- | Convert shape to Pandoc blocks
shapeToBlocks :: PandocMonad m => Archive -> [(Text, Text)] -> PptxShape -> m [Block]
shapeToBlocks _archive _rels (PptxTextBox paras) =
return $ paragraphsToBlocks paras
shapeToBlocks archive rels (PptxPicture relId title alt) = do
-- Resolve relationship to get media path
case lookup relId rels of
Nothing -> return [] -- Image not found
Just target -> do
let mediaPath = resolveMediaPath target
-- Load image bytes and add to MediaBag
case loadMediaFromArchive archive mediaPath of
Nothing -> return []
Just mediaBytes -> do
P.insertMedia (T.unpack mediaPath) Nothing mediaBytes
let altText = if T.null alt then [] else [Str alt]
return [Para [Image nullAttr altText (mediaPath, title)]]
shapeToBlocks _archive _rels (PptxTable rows) =
-- Simple table representation for now
case rows of
[] -> return []
(headerRow:bodyRows) -> do
let makeCell text = Cell nullAttr AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str text]]
headerCells = map makeCell headerRow
bodyCells = map (map makeCell) bodyRows
caption = Caption Nothing []
colSpec = replicate (length headerRow) (AlignDefault, ColWidthDefault)
headerRow' = Row nullAttr headerCells
bodyRows' = map (Row nullAttr) bodyCells
thead = TableHead nullAttr [headerRow']
tbody = [TableBody nullAttr 0 [] bodyRows']
tfoot = TableFoot nullAttr []
return [Table nullAttr caption colSpec thead tbody tfoot]
shapeToBlocks archive rels (PptxDiagramRef dataRelId layoutRelId) = do
-- Parse SmartArt diagram
case parseDiagram archive rels dataRelId layoutRelId of
Left err -> do
-- Failed to parse diagram, return placeholder
return [Para [Str $ "[Diagram parse error: " <> err <> "]"]]
Right diagram ->
return $ diagramToBlocks diagram
shapeToBlocks _archive _rels (PptxGraphic text) =
-- Placeholder for other graphics (charts, etc.)
return [Para [Str $ "[Graphic: " <> text <> "]"]]
-- | Resolve media path (handle relative paths)
resolveMediaPath :: Text -> Text
resolveMediaPath target =
if "../media/" `T.isPrefixOf` target
then "ppt/media/" <> T.drop 9 target -- "../media/" = 9 chars
else if "media/" `T.isPrefixOf` target
then "ppt/" <> target
else target
-- | Load media file from archive
loadMediaFromArchive :: Archive -> Text -> Maybe B.ByteString
loadMediaFromArchive archive path =
case findEntryByPath (T.unpack path) archive of
Just entry -> Just $ fromEntry entry
Nothing -> Nothing
-- | Parse paragraphs from text box
parseParagraphs :: NameSpaces -> Element -> [PptxParagraph]
parseParagraphs ns txBody =
let pElems = findChildrenByName ns "a" "p" txBody
in map (parseParagraph ns) pElems
-- | Parse individual paragraph
parseParagraph :: NameSpaces -> Element -> PptxParagraph
parseParagraph ns pElem =
let level = parseBulletLevel ns pElem
bullet = detectBulletType ns pElem
text = extractParagraphText ns pElem
in PptxParagraph level bullet text
-- | Parse bullet level from paragraph properties
parseBulletLevel :: NameSpaces -> Element -> Int
parseBulletLevel ns pElem =
case findChildByName ns "a" "pPr" pElem >>=
findAttr (unqual "lvl") >>=
(\s -> readMaybe (T.unpack s) :: Maybe Int) of
Just lvl -> lvl
Nothing -> 0 -- Default to level 0
-- | Detect bullet type
detectBulletType :: NameSpaces -> Element -> BulletType
detectBulletType ns pElem =
-- Check for explicit <a:pPr><a:buChar>
case findChildByName ns "a" "pPr" pElem >>=
findChildByName ns "a" "buChar" of
Just _buCharElem -> Bullet
Nothing ->
-- Check for Wingdings symbol (common in PowerPoint)
if hasWingdingsSymbol ns pElem
then WingdingsBullet
else NoBullet
-- | Check if paragraph starts with Wingdings symbol
hasWingdingsSymbol :: NameSpaces -> Element -> Bool
hasWingdingsSymbol ns pElem =
let runs = findChildrenByName ns "a" "r" pElem
checkRun r = case findChildByName ns "a" "rPr" r >>=
findChildByName ns "a" "sym" of
Just symElem ->
case findAttr (unqual "typeface") symElem of
Just typeface -> "Wingdings" `T.isInfixOf` typeface
Nothing -> False
Nothing -> False
in any checkRun runs
-- | Extract text from paragraph
extractParagraphText :: NameSpaces -> Element -> Text
extractParagraphText _ns pElem =
-- Find all <a:t> elements and concatenate
let textElems = filterElementsName (\qn -> qName qn == "t") pElem
texts = map strContent textElems
in T.unwords $ filter (not . T.null) texts
-- | Extract text from DrawingML element (finds all <a:t> descendants)
extractDrawingMLText :: Element -> Text
extractDrawingMLText el =
let textElems = filterElementsName (\qn -> qName qn == "t") el
texts = map strContent textElems
in T.unwords $ filter (not . T.null) texts
-- | Convert paragraphs to blocks, grouping bullets into lists
paragraphsToBlocks :: [PptxParagraph] -> [Block]
paragraphsToBlocks paras =
-- If we have multiple paragraphs with bullets, group them
let hasBullets = any (\p -> paraBullet p /= NoBullet) paras
in if hasBullets
then groupBulletParagraphs paras
else map (\p -> Para [Str $ paraText p]) paras
-- | Group bullet paragraphs into lists
groupBulletParagraphs :: [PptxParagraph] -> [Block]
groupBulletParagraphs paras =
let grouped = groupBy sameBulletLevel paras
in concatMap groupToBlock grouped
where
sameBulletLevel p1 p2 =
(paraBullet p1 /= NoBullet) &&
(paraBullet p2 /= NoBullet) &&
(paraLevel p1 == paraLevel p2)
groupToBlock :: [PptxParagraph] -> [Block]
groupToBlock [] = []
groupToBlock ps@(p:_)
| paraBullet p /= NoBullet =
-- Bullet list
let items = map (\para -> [Plain [Str $ paraText para]]) ps
in [BulletList items]
| otherwise =
-- Plain paragraph
map (\para -> Para [Str $ paraText para]) ps
-- | Check if shape is title placeholder (also used in Slides module)
isTitlePlaceholder :: NameSpaces -> Element -> Bool
isTitlePlaceholder ns el =
case findChildByName ns "p" "nvSpPr" el >>=
findChildByName ns "p" "nvPr" >>=
findChildByName ns "p" "ph" of
Just phElem ->
case findAttr (unqual "type") phElem of
Just phType -> phType == "title" || phType == "ctrTitle"
Nothing -> False
Nothing -> False
|