From 0b140a9a988dd6291586753e23fd955c40ae881b Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 4 Aug 2025 13:04:09 -0700 Subject: ImageSize: Unpack compressed object streams in PDFs... ...and look inside for MediaBox information. Closes #10902. --- src/Text/Pandoc/ImageSize.hs | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index 55b0db63c..a939fdbae 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -57,6 +57,8 @@ import qualified Data.Attoparsec.ByteString as AW import qualified Data.Attoparsec.ByteString.Char8 as A import qualified Codec.Picture.Metadata as Metadata import Codec.Picture (decodeImageWithMetadata) +import Codec.Compression.Zlib (decompress) +-- import Debug.Trace -- quick and dirty functions to get image sizes -- algorithms borrowed from wwwis.pl @@ -300,10 +302,10 @@ pdfSize img = Right sz -> Just sz pPdfSize :: A.Parser ImageSize -pPdfSize = do - A.skipWhile (/='/') - A.char8 '/' - (do A.string "MediaBox" +pPdfSize = + (A.takeWhile1 (/= '/') *> pPdfSize) + <|> + (do A.string "/MediaBox" A.skipSpace A.char8 '[' A.skipSpace @@ -320,7 +322,27 @@ pPdfSize = do , pxY = y2 - y1 , dpiX = 72 , dpiY = 72 } - ) <|> pPdfSize + ) + <|> -- if we encounter a compressed object stream, uncompress it (#10902) + (do A.string "/Type" + A.skipSpace + A.string "/ObjStm" + _ <- A.manyTill pLine (A.string "stream" *> pEol) + stream <- BL.pack <$> A.manyTill + (AW.satisfy (const True)) + (pEol *> A.string "endstream" *> pEol) + let contents = BL.toStrict (decompress stream) + case A.parseOnly pPdfSize contents of + Left _ -> pPdfSize + Right is -> pure is) + <|> + (A.char '/' *> pPdfSize) + where + iseol '\r' = True + iseol '\n' = True + iseol _ = False + pEol = A.satisfy iseol *> A.skipMany (A.satisfy iseol) + pLine = A.takeWhile (not . iseol) <* pEol getSize :: ByteString -> Either T.Text ImageSize getSize img = -- cgit v1.2.3