diff options
| author | Repetitive <[email protected]> | 2025-08-27 10:44:46 +0200 |
|---|---|---|
| committer | John MacFarlane <[email protected]> | 2025-08-28 21:15:36 +0200 |
| commit | 729c12394cb69ecc2d8c8d6177d2cc5526e51a13 (patch) | |
| tree | ef037dc9086d875a025949bd06463a620fced01a /src | |
| parent | 1bf2564b5cd363c0b5c92b73a6912de98782a883 (diff) | |
fix: recognize binary signatures and fail early
Fail early when receiving binary input with a recognized signature:
- zip and zip-based formats, including OpenDocument and Microsoft formats
- PDF
- CFBF-based formats: old Microsoft formats, including .doc and .xls
- DjVu
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/App/Input.hs | 22 | ||||
| -rw-r--r-- | src/Text/Pandoc/Error.hs | 9 | ||||
| -rw-r--r-- | src/Text/Pandoc/Format.hs | 1 |
3 files changed, 28 insertions, 4 deletions
diff --git a/src/Text/Pandoc/App/Input.hs b/src/Text/Pandoc/App/Input.hs index ecb423634..87e71da6b 100644 --- a/src/Text/Pandoc/App/Input.hs +++ b/src/Text/Pandoc/App/Input.hs @@ -14,7 +14,7 @@ module Text.Pandoc.App.Input , readInput ) where -import Control.Monad ((>=>)) +import Control.Monad ((>=>), when) import Control.Monad.Except (throwError, catchError) import Data.Text (Text) import Network.URI (URI (..), parseURI) @@ -109,15 +109,29 @@ inputToText convTabs (fp, (bs,mt)) = (toTextM fp bs) (\case PandocUTF8DecodingError{} -> do - -- TODO check for binary file signatures - -- here and exit with an error instead - -- of treating as latin1e.. + when (hasKnownSignature bs) $ + throwError $ + PandocInputNotTextError (T.pack fp) report $ NotUTF8Encoded (if null fp then "input" else fp) return $ T.pack $ B8.unpack bs e -> throwError e) + where + -- "50 4B 03 04" is zip file signature + isZip bs' = "\x50\x4B\x03\x04" `BS.isPrefixOf` bs' + -- "25 50 44 46 2D" is PDF file signature + isPDF bs' = "\x25\x50\x44\x46\x2D" `BS.isPrefixOf` bs' + -- "D0 CF 11 E0 A1 B1 1A E1" is Compound File Binary Format signature used in + -- variety of old Microsoft formats (.doc and .xls among others) + isCFBF bs' = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" `BS.isPrefixOf` bs' + -- "41 54 26 54 46 4F 52 4D ?? ?? ?? ?? 
44 4A 56" is DjVu signature + isDjVu bs' = case BS.stripPrefix "\x41\x54\x26\x54\x46\x4F\x52\x4D" bs' of + Nothing -> False + Just x -> BS.isPrefixOf "\x44\x4A\x56" $ BS.drop 4 x + + hasKnownSignature bs' = any ($ bs') [isZip, isPDF, isCFBF, isDjVu] inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType)) -> BL.ByteString diff --git a/src/Text/Pandoc/Error.hs b/src/Text/Pandoc/Error.hs index f9c9aea8d..e782df735 100644 --- a/src/Text/Pandoc/Error.hs +++ b/src/Text/Pandoc/Error.hs @@ -65,6 +65,7 @@ data PandocError = PandocIOError Text IOError | PandocUnsupportedExtensionError Text Text | PandocCiteprocError CiteprocError | PandocBibliographyError Text Text + | PandocInputNotTextError Text deriving (Show, Typeable, Generic) instance Exception PandocError @@ -143,6 +144,13 @@ renderError e = prettyCiteprocError e' PandocBibliographyError fp msg -> "Error reading bibliography file " <> fp <> ":\n" <> msg + PandocInputNotTextError fp -> + "Expected text as an input, but received binary data from " <> + (if T.null fp + then "stdin" + else "file " <> fp) <> + ".\nIf you intended to convert from binary format, verify that it's " <> + "supported and use\nexplicit -f FORMAT." -- | Handle PandocError by exiting with an error message. 
@@ -184,6 +192,7 @@ handleError (Left e) = PandocUTF8DecodingError{} -> 92 PandocIpynbDecodingError{} -> 93 PandocUnsupportedCharsetError{} -> 94 + PandocInputNotTextError{} -> 95 PandocCouldNotFindDataFileError{} -> 97 PandocCouldNotFindMetadataFileError{} -> 98 PandocResourceNotFound{} -> 99 diff --git a/src/Text/Pandoc/Format.hs b/src/Text/Pandoc/Format.hs index 433592531..5b87a2531 100644 --- a/src/Text/Pandoc/Format.hs +++ b/src/Text/Pandoc/Format.hs @@ -181,6 +181,7 @@ formatFromFilePath x = ".ctx" -> defFlavor "context" ".db" -> defFlavor "docbook" ".dj" -> defFlavor "djot" + ".djvu" -> defFlavor "djvu" -- so we get an "unknown reader" error ".doc" -> defFlavor "doc" -- so we get an "unknown reader" error ".docx" -> defFlavor "docx" ".dokuwiki" -> defFlavor "dokuwiki" |
