diff options
| author | John MacFarlane <[email protected]> | 2025-06-03 12:12:54 -0700 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2025-06-03 12:15:04 -0700 |
| commit | ccec1e0b65dad97765303e9640b40503961d32ca (patch) | |
| tree | 7d990241f8edf8129aede006a340696f44b08a51 /src | |
| parent | e007758c8c4e4b5b8c29e92731d98d0f7ff8a8cd (diff) | |
Docx reader: handle strict OpenXML as well as transitional.
Closes #7691.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 58 |
1 files changed, 39 insertions, 19 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 99ffcaf09..d0cf020d8 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -92,13 +92,13 @@ import Text.Pandoc.XML.Light filterChild, filterChildrenName, filterElementName, + filterElementsName, lookupAttrBy, parseXMLElement, elChildren, QName(QName, qName), Content(Elem), - Element(..), - findElements ) + Element(..)) data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -453,8 +453,10 @@ getDocumentXmlPath zf = do entry <- findEntryByPath "_rels/.rels" zf relsElem <- parseXMLFromEntry entry let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem - rel <- find (\e -> findAttr (QName "Type" Nothing Nothing) e == - Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument") + rel <- find (\e -> + case findAttr (QName "Type" Nothing Nothing) e of + Just u -> isNamespace "officeDocument" "relationships/officeDocument" u + Nothing -> False) rels fp <- findAttr (QName "Target" Nothing Nothing) rel -- sometimes there will be a leading slash, which windows seems to @@ -1022,8 +1024,8 @@ elemToParPart' :: NameSpaces -> Element -> D [ParPart] elemToParPart' ns element | isElem ns "w" "r" element , Just drawingElem <- findChildByName ns "w" "drawing" element - , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" - , picElems <- findElements (QName "pic" (Just pic_ns) (Just "pic")) drawingElem + , picElems <- filterElementsName + (matchQName "drawingml" "picture" (Just "pic") "pic") drawingElem = let (title, alt) = getTitleAndAlt ns drawingElem drawings = map (\el -> ((findBlip el >>= findAttrByName ns "r" "embed"), el)) @@ -1057,15 +1059,15 @@ elemToParPart' ns element elemToParPart' ns element | isElem ns "w" "r" element , Just drawingElem <- findChildByName ns "w" "drawing" element - , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram" - , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem + , Just _ <- filterElementName + (matchQName "drawingml" "diagram" (Just "dgm") "relIds") drawingElem = return [Diagram] -- Chart elemToParPart' ns element | isElem ns "w" "r" element , Just drawingElem <- findChildByName ns "w" "drawing" element - , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" - , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem + , Just _ <- filterElementName + (matchQName "drawingml" "chart" (Just "c") "chart") drawingElem = return [Chart] elemToParPart' ns element | isElem ns "w" "r" element = do @@ -1146,8 +1148,8 @@ elemToExtent el = childElemToRun :: NameSpaces -> Element -> D [Run] childElemToRun ns element | isElem ns "w" "drawing" element - , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" - , picElems <- findElements (QName "pic" (Just pic_ns) (Just "pic")) element + , picElems <- filterElementsName + (matchQName "drawingml" "picture" (Just "pic") "pic") element = let (title, alt) = getTitleAndAlt ns element drawings = map (\el -> ((findBlip el >>= findAttrByName ns "r" "embed"), el)) @@ -1161,13 +1163,13 @@ childElemToRun ns element drawings childElemToRun ns element | isElem ns "w" "drawing" element - , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" - , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) element + , Just _ <- filterElementName + (matchQName "drawingml" "chart" (Just "c") "chart") element = return [InlineChart] childElemToRun ns element | isElem ns "w" "drawing" element - , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram" - , Just _ <- findElement (QName "relIds" (Just c_ns) (Just "dgm")) element + , Just _ <- filterElementName + (matchQName "drawingml" "diagram" (Just "dgm") "relIds") element = return [InlineDiagram] childElemToRun ns element | isElem ns "w" "footnoteReference" element @@ -1358,11 +1360,9 @@ setFont f s = s{envFont = f} findBlip :: Element -> Maybe Element findBlip el = do - blip <- findElement (QName "blip" (Just a_ns) (Just "a")) el + blip <- filterElementName (matchQName "drawingml" "main" (Just "a") "blip") el -- return svg if present: filterElementName (\(QName tag _ _) -> tag == "svgBlip") el `mplus` pure blip - where - a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" hasCaptionStyle :: ParagraphStyle -> Bool hasCaptionStyle parstyle = any (isCaptionStyleName . pStyleName) (pStyle parstyle) @@ -1387,3 +1387,23 @@ stripCaptionLabel els = (qName name == "instrText" && let ws = T.words (strContent el) in ("Table" `elem` ws || "Figure" `elem` ws)) + +isNamespace :: Text -> Text -> Text -> Bool +isNamespace primary secondary url = + -- first try transitional: + case T.stripPrefix "http://schemas.openxmlformats.org/" url of + Just path -> path == primary <> "/2006/" <> secondary + Nothing -> -- then try strict: + case T.stripPrefix "http://purl.oclc.org/ooxml/" url of + Just path -> path == primary <> "/" <> snakeToCamel secondary + Nothing -> False + where + snakeToCamel "custom-properties" = "customProperties" + snakeToCamel "extended-properties" = "extendedProperties" + snakeToCamel x = x + +matchQName :: Text -> Text -> Maybe Text -> Text -> QName -> Bool +matchQName primary secondary mbprefix name (QName name' mbns' mbprefix') = + name == name' && + (isNothing mbprefix || mbprefix' == mbprefix) && + maybe True (isNamespace primary secondary) mbns' |
