From d81ee85952400ef62b83146ed695a5ea113d120d Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 13 Oct 2025 20:23:22 +0200 Subject: RTF reader: improve hyperlink parsing. Closes #11211. --- src/Text/Pandoc/Readers/RTF.hs | 60 ++++++++++++++++++++++++------------------ test/command/11211.md | 16 +++++++++++ 2 files changed, 50 insertions(+), 26 deletions(-) create mode 100644 test/command/11211.md diff --git a/src/Text/Pandoc/Readers/RTF.hs b/src/Text/Pandoc/Readers/RTF.hs index 52c549c75..b1195ac19 100644 --- a/src/Text/Pandoc/Readers/RTF.hs +++ b/src/Text/Pandoc/Readers/RTF.hs @@ -784,26 +784,40 @@ removeCommonFormatting = -- {\field{\*\fldinst{HYPERLINK "http://pandoc.org"}}{\fldrslt foo}} handleField :: PandocMonad m => Blocks -> [Tok] -> RTFParser m Blocks -handleField bs - (Tok _ - (Grouped - (Tok _ (ControlSymbol '*') - :Tok _ (ControlWord "fldinst" Nothing) - :Tok _ (Grouped (Tok _ (UnformattedText insttext):rest)) - :_)) - :linktoks) - | Just linkdest <- getHyperlink insttext - = do let linkdest' = case rest of - (Tok _ (ControlSymbol '\\') - : Tok _ (UnformattedText t) - : _) | Just bkmrk <- T.stripPrefix "l" t - -> "#" <> unquote bkmrk - _ -> linkdest - modifyGroup $ \g -> g{ gHyperlink = Just linkdest' } - result <- foldM processTok bs linktoks - modifyGroup $ \g -> g{ gHyperlink = Nothing } - return result -handleField bs _ = pure bs +handleField bs ts = do + let isFieldMod (Tok _ (ControlWord w _)) = + w `elem` ["flddirty", "fldedit", "fldlock", "fldpriv"] + isFieldMod _ = False + case dropWhile isFieldMod ts of + [Tok _ (Grouped + (Tok _ (ControlSymbol '*') + :Tok _ (ControlWord "fldinst" Nothing) + :Tok _ (Grouped instrtoks) + :_)), + Tok _ (Grouped + (Tok _ (ControlWord "fldrslt" Nothing) + :Tok _ (Grouped resulttoks) : _))] -> do + case getHyperlink instrtoks of + Just linkdest -> do + modifyGroup $ \g -> g{ gHyperlink = Just linkdest } + result <- foldM processTok bs resulttoks + modifyGroup $ \g -> g{ gHyperlink = Nothing } + return result + Nothing -> foldM processTok bs resulttoks + _ -> pure bs + +getHyperlink :: [Tok] -> Maybe Text +getHyperlink [] = Nothing +getHyperlink (Tok _ (UnformattedText w) : rest) + | Just w' <- unquote <$> T.stripPrefix "HYPERLINK" (T.strip w) + = if T.null w' + then case rest of + (Tok _ (ControlSymbol '\\') : Tok _ (UnformattedText b) : _) + | Just bkmrk <- unquote <$> T.stripPrefix "l " (T.strip b) + -> Just $ "#" <> bkmrk + _ -> Just mempty + else Just w' +getHyperlink (_:ts) = getHyperlink ts unquote :: Text -> Text unquote = T.dropWhile (=='"') . T.dropWhileEnd (=='"') . T.strip @@ -944,12 +958,6 @@ handlePict toks = do _ -> pict -getHyperlink :: Text -> Maybe Text -getHyperlink t = - case T.stripPrefix "HYPERLINK" (T.strip t) of - Nothing -> Nothing - Just rest -> Just $ unquote rest - processFontTable :: [Tok] -> FontTable processFontTable = snd . foldl' go (0, mempty) where diff --git a/test/command/11211.md b/test/command/11211.md new file mode 100644 index 000000000..0ecb8895a --- /dev/null +++ b/test/command/11211.md @@ -0,0 +1,16 @@ +``` +% pandoc -f rtf -t native +{\field{\*\fldinst{\rtlch\ab0\ai0\af2\alang1025\afs22\ltrch\b0\i0\fs22\lang1033\langnp1033\langfe1033\langfenp1033 +\loch\af2\dbch\af2\hich\f2\insrsid10976062\strike0\ulnone\cf1 HYPERLINK "https://example.com"}{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b0200000003000000e0c9ea79f9bace118c8200aa004ba90b28000000680074007400700073003a002f002f006500780061006d0070006c0065002e0063006f006d000000}} +{\fldrslt{\rtlch\ab0\ai0\af2\alang1025\afs22\ltrch\b0\i0\fs22\lang1033\langnp1033\langfe1033\langfenp1033\loch\af2\dbch\af2\hich\f2\strike0\ul\cf2 link}}} +^D +[ Para + [ Underline + [ Link + ( "" , [] , [] ) + [ Str "link" ] + ( "https://example.com" , "" ) + ] + ] +] +``` -- cgit v1.2.3