diff options
| author | John MacFarlane <[email protected]> | 2024-12-19 14:22:51 -0800 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2024-12-19 14:22:51 -0800 |
| commit | cef339f78fc3fc8afd6456f61ecf2bfd9a53daa1 (patch) | |
| tree | e55aaa03aefcd3614f69ed14d3bfc00a7e579737 | |
| parent | 26b5c957db18639c0c96d26cf484ea1683597eb0 (diff) | |
T.P.mediaBag insertMedia: fast path for data URIs.
Avoid the slow URI parser from network-uri on large data URIs.
See #10075. In a benchmark with a large base64 image in HTML ->
docx, this patch causes us to go from 7942 GCs to 3654, and from
3781M in use to 1396M in use.
(Note that before the last few commits, this was running 9099 GCs
and 4350M in use.)
| -rw-r--r-- | src/Text/Pandoc/MediaBag.hs | 56 |
1 files changed, 33 insertions, 23 deletions
diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs index 1afc49a9b..678399703 100644 --- a/src/Text/Pandoc/MediaBag.hs +++ b/src/Text/Pandoc/MediaBag.hs @@ -58,6 +58,8 @@ instance Show MediaBag where -- | We represent paths with /, in normalized form. Percent-encoding -- is not resolved. canonicalize :: FilePath -> Text +-- avoid an expensive call to isURI for data URIs: +canonicalize fp@('d':'a':'t':'a':':':_) = T.pack fp canonicalize fp | isURI fp = T.pack fp | otherwise = T.replace "\\" "/" . T.pack . normalise $ fp @@ -77,29 +79,37 @@ insertMedia :: FilePath -- ^ relative path and canonical name of resource -> BL.ByteString -- ^ contents of resource -> MediaBag -> MediaBag -insertMedia fp mbMime contents (MediaBag mediamap) = - MediaBag (M.insert fp' mediaItem mediamap) - where mediaItem = MediaItem{ mediaPath = newpath - , mediaContents = contents - , mediaMimeType = mt } - fp' = canonicalize fp - fp'' = unEscapeString $ T.unpack fp' - uri = parseURI fp - newpath = if Posix.isRelative fp'' - && Windows.isRelative fp'' - && isNothing uri - && not (".." `isInfixOf` fp'') - && '%' `notElem` fp'' - then fp'' - else show (hashWith SHA1 $ BL.toStrict contents) <> ext - fallback = case takeExtension fp'' of - ".gz" -> getMimeTypeDef $ dropExtension fp'' - _ -> getMimeTypeDef fp'' - mt = fromMaybe fallback mbMime - path = maybe fp'' (unEscapeString . uriPath) uri - ext = case takeExtension path of - '.':e | '%' `notElem` e -> '.':e - _ -> maybe "" (\x -> '.':T.unpack x) $ extensionFromMimeType mt +insertMedia fp mbMime contents (MediaBag mediamap) + | 'd':'a':'t':'a':':':_ <- fp + , Just mt' <- mbMime + = MediaBag (M.insert fp' + MediaItem{ mediaPath = hashpath + , mediaContents = contents + , mediaMimeType = mt' } mediamap) + | otherwise = MediaBag (M.insert fp' mediaItem mediamap) + where + mediaItem = MediaItem{ mediaPath = newpath + , mediaContents = contents + , mediaMimeType = mt } + fp' = canonicalize fp + fp'' = unEscapeString $ T.unpack fp' + uri = parseURI fp + hashpath = show (hashWith SHA1 (BL.toStrict contents)) <> ext + newpath = if Posix.isRelative fp'' + && Windows.isRelative fp'' + && isNothing uri + && not (".." `isInfixOf` fp'') + && '%' `notElem` fp'' + then fp'' + else hashpath + fallback = case takeExtension fp'' of + ".gz" -> getMimeTypeDef $ dropExtension fp'' + _ -> getMimeTypeDef fp'' + mt = fromMaybe fallback mbMime + path = maybe fp'' (unEscapeString . uriPath) uri + ext = case takeExtension path of + '.':e | '%' `notElem` e -> '.':e + _ -> maybe "" (\x -> '.':T.unpack x) $ extensionFromMimeType mt -- | Lookup a media item in a 'MediaBag', returning mime type and contents. lookupMedia :: FilePath |
