aboutsummaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorJohn MacFarlane <[email protected]>2024-06-12 22:11:21 -0700
committerJohn MacFarlane <[email protected]>2024-06-12 22:31:33 -0700
commit94975a4e3c50db1c02b373c8196fe77af0522a01 (patch)
tree2fc136e6ebf8e01599ab41f75d4edcb19b9b15b4 /src/Text
parentb2f0d8e7992d9f3e0d655a49a89f77c48a802c1c (diff)
Docx reader: improve handling of captions.
- Turn captioned images into Figure elements. Closes #9391. - Improve the logic for associating elements with captions. Closes #9358. - Ensure that captions that can't be associated with an element aren't just silently dropped. Closes #9610.
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs32
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs76
2 files changed, 67 insertions, 41 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index a1112c53d..3cc1860eb 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -78,7 +78,7 @@ import Data.List (delete, intersect, foldl')
import Data.Char (isSpace)
import qualified Data.Map as M
import qualified Data.Text as T
-import Data.Maybe (catMaybes, isJust, fromMaybe, mapMaybe)
+import Data.Maybe (isJust, fromMaybe, mapMaybe)
import Data.Sequence (ViewL (..), viewl)
import qualified Data.Sequence as Seq
import qualified Data.Set as Set
@@ -132,7 +132,6 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text
-- restarting
, docxListState :: M.Map (T.Text, T.Text) Integer
, docxPrevPara :: Inlines
- , docxTableCaptions :: [Blocks]
, docxReferences :: M.Map ItemId (Reference Inlines)
}
@@ -145,7 +144,6 @@ instance Default DState where
, docxDropCap = mempty
, docxListState = M.empty
, docxPrevPara = mempty
- , docxTableCaptions = []
, docxReferences = mempty
}
@@ -664,11 +662,6 @@ normalizeToClassName = T.map go . fromStyleName
where go c | isSpace c = '-'
| otherwise = c
-bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks)
-bodyPartToTableCaption (Capt pPr parparts) =
- Just <$> bodyPartToBlocks (Paragraph pPr parparts)
-bodyPartToTableCaption _ = pure Nothing
-
bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks
bodyPartToBlocks (Paragraph pPr parparts)
| Just True <- pBidi pPr = do
@@ -767,17 +760,22 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr}
in
bodyPartToBlocks $ Paragraph pPr' parparts
-bodyPartToBlocks (Capt _ _) =
- return mempty
+bodyPartToBlocks (Captioned parstyle parparts bpart) = do
+ bs <- bodyPartToBlocks bpart
+ captContents <- bodyPartToBlocks (Paragraph parstyle parparts)
+ let capt = Caption Nothing (toList captContents)
+ case toList bs of
+ [Table attr _cap colspecs thead tbodies tfoot]
+ -> pure $ singleton $ Table attr capt colspecs thead tbodies tfoot
+ [Figure attr _cap blks]
+ -> pure $ singleton $ Figure attr capt blks
+ [Para im@[Image{}]]
+ -> pure $ singleton $ Figure nullAttr capt [Plain im]
+ _ -> pure captContents
bodyPartToBlocks (Tbl _ _ _ []) =
return mempty
bodyPartToBlocks (Tbl cap grid look parts) = do
- captions <- gets docxTableCaptions
- fullCaption <- case captions of
- c : cs -> do
- modify (\s -> s { docxTableCaptions = cs })
- return c
- [] -> return $ if T.null cap then mempty else plain (text cap)
+ let fullCaption = if T.null cap then mempty else plain (text cap)
let shortCaption = if T.null cap then Nothing else Just (toList (text cap))
cap' = caption shortCaption fullCaption
(hdr, rows) = splitHeaderRows (firstRowFormatting look) parts
@@ -840,11 +838,9 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block])
bodyToOutput (Body bps) = do
let (metabps, blkbps) = sepBodyParts bps
meta <- bodyPartsToMeta metabps
- captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps
let isNumberedPara (Paragraph pPr _) = numbered pPr
isNumberedPara _ = False
modify (\s -> s { docxNumberedHeadings = any isNumberedPara blkbps })
- modify (\s -> s { docxTableCaptions = captions })
blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps
blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
blks'' <- removeOrphanAnchors blks'
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index 16f7234d5..e5bd0bc20 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -1,4 +1,3 @@
-{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
@@ -267,6 +266,7 @@ data ParagraphStyle = ParagraphStyle { pStyle :: [ParStyle]
, dropCap :: Bool
, pChange :: Maybe TrackedChange
, pBidi :: Maybe Bool
+ , pKeepNext :: Bool
}
deriving Show
@@ -278,13 +278,14 @@ defaultParagraphStyle = ParagraphStyle { pStyle = []
, dropCap = False
, pChange = Nothing
, pBidi = Just False
+ , pKeepNext = False
}
data BodyPart = Paragraph ParagraphStyle [ParPart]
| ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart]
| Tbl T.Text TblGrid TblLook [Row]
- | Capt ParagraphStyle [ParPart]
+ | Captioned ParagraphStyle [ParPart] BodyPart
| HRule
deriving Show
@@ -472,7 +473,7 @@ archiveToDocument zf = do
elemToBody :: NameSpaces -> Element -> D Body
elemToBody ns element | isElem ns "w" "body" element =
- fmap Body (mapD (elemToBodyPart ns) (elChildren element))
+ Body . addCaptioned <$> mapD (elemToBodyPart ns) (elChildren element)
elemToBody _ _ = throwError WrongElem
archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap)
@@ -761,6 +762,24 @@ mkListItem parstyle numId lvl parparts = do
pStyleIndentation :: ParagraphStyle -> Maybe ParIndentation
pStyleIndentation style = (getParStyleField indent . pStyle) style
+addCaptioned :: [BodyPart] -> [BodyPart]
+addCaptioned [] = []
+addCaptioned (Paragraph parstyle parparts : x : xs)
+ | hasCaptionStyle parstyle
+ , isCaptionable x
+ = Captioned parstyle parparts x : addCaptioned xs
+addCaptioned (x : Paragraph parstyle parparts : xs)
+ | hasCaptionStyle parstyle
+ , not (pKeepNext parstyle)
+ , isCaptionable x
+ = Captioned parstyle parparts x : addCaptioned xs
+addCaptioned (x:xs) = x : addCaptioned xs
+
+isCaptionable :: BodyPart -> Bool
+isCaptionable (Paragraph _ [Drawing{}]) = True
+isCaptionable (Tbl{}) = True
+isCaptionable _ = False
+
elemToBodyPart :: NameSpaces -> Element -> D BodyPart
elemToBodyPart ns element
| isElem ns "m" "oMathPara" element = do
@@ -803,35 +822,21 @@ elemToBodyPart ns element
<$> asks envParStyles
<*> asks envNumbering
- let hasCaptionStyle =
- any ((== "caption") . pStyleName) (pStyle parstyle)
-
- let isTableNumberElt el@(Element name attribs _ _) =
- (qName name == "fldSimple" &&
- case lookupAttrBy ((== "instr") . qName) attribs of
- Nothing -> False
- Just instr -> "Table" `elem` T.words instr) ||
- (qName name == "instrText" && "Table" `elem` T.words (strContent el))
-
- let isTable = hasCaptionStyle &&
- isJust (filterChild isTableNumberElt element)
+ let children =
+ (if hasCaptionStyle parstyle
+ then stripCaptionLabel
+ else id) (elChildren element)
- let stripOffLabel = dropWhile (not . isTableNumberElt)
-
- let children = (if isTable
- then stripOffLabel
- else id) $ elChildren element
parparts' <- mconcat <$> mapD (elemToParPart ns) children
fldCharState <- gets stateFldCharState
modify $ \st -> st {stateFldCharState = emptyFldCharContents fldCharState}
-- Word uses list enumeration for numbered headings, so we only
-- want to infer a list from the styles if it is NOT a heading.
- let parparts = parparts' ++ (openFldCharsToParParts fldCharState)
+ let parparts = parparts' ++ openFldCharsToParParts fldCharState
case pHeading parstyle of
Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
mkListItem parstyle numId lvl parparts
- _ -> return $ (if hasCaptionStyle then Capt else Paragraph)
- parstyle parparts
+ _ -> return $ Paragraph parstyle parparts
elemToBodyPart ns element
| isElem ns "w" "tbl" element = do
@@ -1259,6 +1264,7 @@ elemToParagraphStyle ns element sty numbering
) >>=
getTrackedChange ns
, pBidi = checkOnOff ns pPr (elemName ns "w" "bidi")
+ , pKeepNext = isJust $ findChildByName ns "w" "keepNext" pPr
}
| otherwise = defaultParagraphStyle
@@ -1335,3 +1341,27 @@ findBlip el = do
filterElementName (\(QName tag _ _) -> tag == "svgBlip") el `mplus` pure blip
where
a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+
+hasCaptionStyle :: ParagraphStyle -> Bool
+hasCaptionStyle parstyle = any (isCaptionStyleName . pStyleName) (pStyle parstyle)
+ where -- note that these are case insensitive:
+ isCaptionStyleName "caption" = True
+ isCaptionStyleName "table caption" = True
+ isCaptionStyleName "image caption" = True
+ isCaptionStyleName _ = False
+
+stripCaptionLabel :: [Element] -> [Element]
+stripCaptionLabel els =
+ if any isNumberElt els
+ then dropWhile (not . isNumberElt) els
+ else els
+ where
+ isNumberElt el@(Element name attribs _ _) =
+ (qName name == "fldSimple" &&
+ case lookupAttrBy ((== "instr") . qName) attribs of
+ Nothing -> False
+ Just instr -> "Table" `elem` T.words instr ||
+ "Figure" `elem` T.words instr) ||
+ (qName name == "instrText" &&
+ let ws = T.words (strContent el)
+ in ("Table" `elem` ws || "Figure" `elem` ws))