diff options
| author | John MacFarlane <[email protected]> | 2024-06-12 22:11:21 -0700 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2024-06-12 22:31:33 -0700 |
| commit | 94975a4e3c50db1c02b373c8196fe77af0522a01 (patch) | |
| tree | 2fc136e6ebf8e01599ab41f75d4edcb19b9b15b4 /src/Text | |
| parent | b2f0d8e7992d9f3e0d655a49a89f77c48a802c1c (diff) | |
Docx reader: improve handling of captions.
- Turn captioned images into Figure elements. Closes #9391.
- Improve the logic for associating elements with captions.
Closes #9358.
- Ensure that captions that can't be associated with an
element aren't just silently dropped. Closes #9610.
Diffstat (limited to 'src/Text')
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 32 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 76 |
2 files changed, 67 insertions, 41 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index a1112c53d..3cc1860eb 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -78,7 +78,7 @@ import Data.List (delete, intersect, foldl') import Data.Char (isSpace) import qualified Data.Map as M import qualified Data.Text as T -import Data.Maybe (catMaybes, isJust, fromMaybe, mapMaybe) +import Data.Maybe (isJust, fromMaybe, mapMaybe) import Data.Sequence (ViewL (..), viewl) import qualified Data.Sequence as Seq import qualified Data.Set as Set @@ -132,7 +132,6 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text -- restarting , docxListState :: M.Map (T.Text, T.Text) Integer , docxPrevPara :: Inlines - , docxTableCaptions :: [Blocks] , docxReferences :: M.Map ItemId (Reference Inlines) } @@ -145,7 +144,6 @@ instance Default DState where , docxDropCap = mempty , docxListState = M.empty , docxPrevPara = mempty - , docxTableCaptions = [] , docxReferences = mempty } @@ -664,11 +662,6 @@ normalizeToClassName = T.map go . fromStyleName where go c | isSpace c = '-' | otherwise = c -bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks) -bodyPartToTableCaption (Capt pPr parparts) = - Just <$> bodyPartToBlocks (Paragraph pPr parparts) -bodyPartToTableCaption _ = pure Nothing - bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) | Just True <- pBidi pPr = do @@ -767,17 +760,22 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) = let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts -bodyPartToBlocks (Capt _ _) = - return mempty +bodyPartToBlocks (Captioned parstyle parparts bpart) = do + bs <- bodyPartToBlocks bpart + captContents <- bodyPartToBlocks (Paragraph parstyle parparts) + let capt = Caption Nothing (toList captContents) + case toList bs of + [Table attr _cap colspecs thead tbodies tfoot] + -> pure $ singleton $ Table attr capt colspecs thead tbodies tfoot + [Figure attr _cap blks] + -> pure $ singleton $ Figure attr capt blks + [Para im@[Image{}]] + -> pure $ singleton $ Figure nullAttr capt [Plain im] + _ -> pure captContents bodyPartToBlocks (Tbl _ _ _ []) = return mempty bodyPartToBlocks (Tbl cap grid look parts) = do - captions <- gets docxTableCaptions - fullCaption <- case captions of - c : cs -> do - modify (\s -> s { docxTableCaptions = cs }) - return c - [] -> return $ if T.null cap then mempty else plain (text cap) + let fullCaption = if T.null cap then mempty else plain (text cap) let shortCaption = if T.null cap then Nothing else Just (toList (text cap)) cap' = caption shortCaption fullCaption (hdr, rows) = splitHeaderRows (firstRowFormatting look) parts @@ -840,11 +838,9 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps - captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps let isNumberedPara (Paragraph pPr _) = numbered pPr isNumberedPara _ = False modify (\s -> s { docxNumberedHeadings = any isNumberedPara blkbps }) - modify (\s -> s { docxTableCaptions = captions }) blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks blks'' <- removeOrphanAnchors blks' diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 16f7234d5..e5bd0bc20 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} @@ -267,6 +266,7 @@ data ParagraphStyle = ParagraphStyle { pStyle :: [ParStyle] , dropCap :: Bool , pChange :: Maybe TrackedChange , pBidi :: Maybe Bool + , pKeepNext :: Bool } deriving Show @@ -278,13 +278,14 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] , dropCap = False , pChange = Nothing , pBidi = Just False + , pKeepNext = False } data BodyPart = Paragraph ParagraphStyle [ParPart] | ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart] | Tbl T.Text TblGrid TblLook [Row] - | Capt ParagraphStyle [ParPart] + | Captioned ParagraphStyle [ParPart] BodyPart | HRule deriving Show @@ -472,7 +473,7 @@ archiveToDocument zf = do elemToBody :: NameSpaces -> Element -> D Body elemToBody ns element | isElem ns "w" "body" element = - fmap Body (mapD (elemToBodyPart ns) (elChildren element)) + Body . addCaptioned <$> mapD (elemToBodyPart ns) (elChildren element) elemToBody _ _ = throwError WrongElem archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) @@ -761,6 +762,24 @@ mkListItem parstyle numId lvl parparts = do pStyleIndentation :: ParagraphStyle -> Maybe ParIndentation pStyleIndentation style = (getParStyleField indent . pStyle) style +addCaptioned :: [BodyPart] -> [BodyPart] +addCaptioned [] = [] +addCaptioned (Paragraph parstyle parparts : x : xs) + | hasCaptionStyle parstyle + , isCaptionable x + = Captioned parstyle parparts x : addCaptioned xs +addCaptioned (x : Paragraph parstyle parparts : xs) + | hasCaptionStyle parstyle + , not (pKeepNext parstyle) + , isCaptionable x + = Captioned parstyle parparts x : addCaptioned xs +addCaptioned (x:xs) = x : addCaptioned xs + +isCaptionable :: BodyPart -> Bool +isCaptionable (Paragraph _ [Drawing{}]) = True +isCaptionable (Tbl{}) = True +isCaptionable _ = False + elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "m" "oMathPara" element = do @@ -803,35 +822,21 @@ elemToBodyPart ns element <$> asks envParStyles <*> asks envNumbering - let hasCaptionStyle = - any ((== "caption") . pStyleName) (pStyle parstyle) - - let isTableNumberElt el@(Element name attribs _ _) = - (qName name == "fldSimple" && - case lookupAttrBy ((== "instr") . qName) attribs of - Nothing -> False - Just instr -> "Table" `elem` T.words instr) || - (qName name == "instrText" && "Table" `elem` T.words (strContent el)) - - let isTable = hasCaptionStyle && - isJust (filterChild isTableNumberElt element) + let children = + (if hasCaptionStyle parstyle + then stripCaptionLabel + else id) (elChildren element) - let stripOffLabel = dropWhile (not . isTableNumberElt) - - let children = (if isTable - then stripOffLabel - else id) $ elChildren element parparts' <- mconcat <$> mapD (elemToParPart ns) children fldCharState <- gets stateFldCharState modify $ \st -> st {stateFldCharState = emptyFldCharContents fldCharState} -- Word uses list enumeration for numbered headings, so we only -- want to infer a list from the styles if it is NOT a heading. - let parparts = parparts' ++ (openFldCharsToParParts fldCharState) + let parparts = parparts' ++ openFldCharsToParParts fldCharState case pHeading parstyle of Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do mkListItem parstyle numId lvl parparts - _ -> return $ (if hasCaptionStyle then Capt else Paragraph) - parstyle parparts + _ -> return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do @@ -1259,6 +1264,7 @@ elemToParagraphStyle ns element sty numbering ) >>= getTrackedChange ns , pBidi = checkOnOff ns pPr (elemName ns "w" "bidi") + , pKeepNext = isJust $ findChildByName ns "w" "keepNext" pPr } | otherwise = defaultParagraphStyle @@ -1335,3 +1341,27 @@ findBlip el = do filterElementName (\(QName tag _ _) -> tag == "svgBlip") el `mplus` pure blip where a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + +hasCaptionStyle :: ParagraphStyle -> Bool +hasCaptionStyle parstyle = any (isCaptionStyleName . pStyleName) (pStyle parstyle) + where -- note that these are case insensitive: + isCaptionStyleName "caption" = True + isCaptionStyleName "table caption" = True + isCaptionStyleName "image caption" = True + isCaptionStyleName _ = False + +stripCaptionLabel :: [Element] -> [Element] +stripCaptionLabel els = + if any isNumberElt els + then dropWhile (not . isNumberElt) els + else els + where + isNumberElt el@(Element name attribs _ _) = + (qName name == "fldSimple" && + case lookupAttrBy ((== "instr") . qName) attribs of + Nothing -> False + Just instr -> "Table" `elem` T.words instr || + "Figure" `elem` T.words instr) || + (qName name == "instrText" && + let ws = T.words (strContent el) + in ("Table" `elem` ws || "Figure" `elem` ws)) |
