aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn MacFarlane <[email protected]>2024-12-19 14:22:51 -0800
committerJohn MacFarlane <[email protected]>2024-12-19 14:22:51 -0800
commitcef339f78fc3fc8afd6456f61ecf2bfd9a53daa1 (patch)
treee55aaa03aefcd3614f69ed14d3bfc00a7e579737 /src
parent26b5c957db18639c0c96d26cf484ea1683597eb0 (diff)
T.P.mediaBag insertMedia: fast path for data URIs.
Avoid the slow URI parser from network-uri on large data URIs. See #10075. In a benchmark with a large base64 image in HTML -> docx, this patch causes us to go from 7942 GCs to 3654, and from 3781M in use to 1396M in use. (Note that before the last few commits, this was running 9099 GCs and 4350M in use.)
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/MediaBag.hs56
1 files changed, 33 insertions, 23 deletions
diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs
index 1afc49a9b..678399703 100644
--- a/src/Text/Pandoc/MediaBag.hs
+++ b/src/Text/Pandoc/MediaBag.hs
@@ -58,6 +58,8 @@ instance Show MediaBag where
-- | We represent paths with /, in normalized form. Percent-encoding
-- is not resolved.
canonicalize :: FilePath -> Text
+-- avoid an expensive call to isURI for data URIs:
+canonicalize fp@('d':'a':'t':'a':':':_) = T.pack fp
canonicalize fp
| isURI fp = T.pack fp
| otherwise = T.replace "\\" "/" . T.pack . normalise $ fp
@@ -77,29 +79,37 @@ insertMedia :: FilePath -- ^ relative path and canonical name of resource
-> BL.ByteString -- ^ contents of resource
-> MediaBag
-> MediaBag
-insertMedia fp mbMime contents (MediaBag mediamap) =
- MediaBag (M.insert fp' mediaItem mediamap)
- where mediaItem = MediaItem{ mediaPath = newpath
- , mediaContents = contents
- , mediaMimeType = mt }
- fp' = canonicalize fp
- fp'' = unEscapeString $ T.unpack fp'
- uri = parseURI fp
- newpath = if Posix.isRelative fp''
- && Windows.isRelative fp''
- && isNothing uri
- && not (".." `isInfixOf` fp'')
- && '%' `notElem` fp''
- then fp''
- else show (hashWith SHA1 $ BL.toStrict contents) <> ext
- fallback = case takeExtension fp'' of
- ".gz" -> getMimeTypeDef $ dropExtension fp''
- _ -> getMimeTypeDef fp''
- mt = fromMaybe fallback mbMime
- path = maybe fp'' (unEscapeString . uriPath) uri
- ext = case takeExtension path of
- '.':e | '%' `notElem` e -> '.':e
- _ -> maybe "" (\x -> '.':T.unpack x) $ extensionFromMimeType mt
+insertMedia fp mbMime contents (MediaBag mediamap)
+ | 'd':'a':'t':'a':':':_ <- fp
+ , Just mt' <- mbMime
+ = MediaBag (M.insert fp'
+ MediaItem{ mediaPath = hashpath
+ , mediaContents = contents
+ , mediaMimeType = mt' } mediamap)
+ | otherwise = MediaBag (M.insert fp' mediaItem mediamap)
+ where
+ mediaItem = MediaItem{ mediaPath = newpath
+ , mediaContents = contents
+ , mediaMimeType = mt }
+ fp' = canonicalize fp
+ fp'' = unEscapeString $ T.unpack fp'
+ uri = parseURI fp
+ hashpath = show (hashWith SHA1 (BL.toStrict contents)) <> ext
+ newpath = if Posix.isRelative fp''
+ && Windows.isRelative fp''
+ && isNothing uri
+ && not (".." `isInfixOf` fp'')
+ && '%' `notElem` fp''
+ then fp''
+ else hashpath
+ fallback = case takeExtension fp'' of
+ ".gz" -> getMimeTypeDef $ dropExtension fp''
+ _ -> getMimeTypeDef fp''
+ mt = fromMaybe fallback mbMime
+ path = maybe fp'' (unEscapeString . uriPath) uri
+ ext = case takeExtension path of
+ '.':e | '%' `notElem` e -> '.':e
+ _ -> maybe "" (\x -> '.':T.unpack x) $ extensionFromMimeType mt
-- | Lookup a media item in a 'MediaBag', returning mime type and contents.
lookupMedia :: FilePath